In this RMarkdown we will overview how to run local adaptation analysis with R.Sambada
library(tidyverse)
library(colorout)
library(here)
library(ggplot2)
# library(Biostrings)
library(scales)
# library(reticulate)
# library(flextable)
# library(officer)
# library(rmdformats)
library(extrafont)
library(forcats)
library(ggrepel)
library(ggtext)
# library(R.SamBada)
library(rgdal)
library(stats)
library(geosphere)
library(vcfR)
library(OutFLANK)
library(ggvenn)This tool does not directly account for population structure. It instead looks for loci that show a higher Fst than the neutral expectation, under a model of hierarchical population structure. The method is robust to hierarchical population structure and a range of demographic histories.
Check the documentation https://htmlpreview.github.io/?https://github.com/whitlock/OutFLANK/blob/master/inst/doc/OutFLANKAnalysis.html
“OutFLANK is an R package that implements the method developed by Whitlock and Lotterhos (2015) to use likelihood on a trimmed distribution of FST values to infer the distribution of FST for neutral markers. This distribution is then used to assign q-values to each locus to detect outliers that may be due to spatially heterogeneous selection.”
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2524426 134.9 4700540 251.1 NA 4700540 251.1
## Vcells 4290719 32.8 10146329 77.5 32768 7397779 56.5
Check vignettes
## starting httpd help server ... done
Extract the intergenic SNPs and do LD pruning
plink2 \
--bfile output/quality_control/file7 \
--extract data/files/intergenic_SNPs.txt \
--indep-pairwise 5 1 0.1 \
--out output/outflank/indepSNP \
--silent;
grep 'samples\|variants\|remaining' output/outflank/indepSNP.log## 60 samples (24 females, 26 males, 10 ambiguous; 60 founders) loaded from
## 110353 variants loaded from output/quality_control/file7.bim.
## --extract: 12112 variants remaining.
## 12112 variants remaining after main filters.
## --indep-pairwise (3 compute threads): 6694/12112 variants removed.
Create vcf with intergenic SNPs that are not linked
plink2 \
--bfile output/quality_control/file7 \
--extract output/outflank/indepSNP.prune.in \
--export vcf \
--make-bed \
--maf 0.1 \
--geno 0.2 \
--out output/outflank/intergenic \
--silent;
grep 'samples\|variants\|remaining' output/outflank/intergenic.log## 60 samples (24 females, 26 males, 10 ambiguous; 60 founders) loaded from
## 110353 variants loaded from output/quality_control/file7.bim.
## --extract: 5418 variants remaining.
## --geno: 0 variants removed due to missing genotype data.
## 930 variants removed due to allele frequency threshold(s)
## 4488 variants remaining after main filters.
We need to import the data
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 4488
## column count: 69
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 4488
## Character matrix gt cols: 69
## skip: 0
## nrows: 4488
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant: 4488
## All variants processed
# Extract genotypes
geno <- vcfR::extract.gt(vcf)
# Extract locus names (positions)
locusNames <- vcfR::getPOS(vcf)
# Samples
sampleNames <- colnames(geno)
# Extract population names by splitting the sample names
# If your sample names contain population information.
popNames <- sapply(strsplit(sampleNames, "_"), "[", 1)
# Initialize new matrix for transformed genotypes
G <- matrix(NA, nrow = nrow(geno), ncol = ncol(geno))
# Recode genotypes
G[geno %in% c("0/0", "0|0")] <- 0
G[geno %in% c("0/1", "1/0", "1|0", "0|1")] <- 1
G[geno %in% c("1/1", "1|1")] <- 2
# Replace NA with 9 for missing data
G <- replace(G, is.na(G), 9)
# Check the counts of each genotype
table(as.vector(G))##
## 0 1 2 9
## 108693 89965 64537 6085
# Extract SNP names
snpNames <- vcfR::getID(vcf)
# Calculate FST with SNP names
my_fst <- MakeDiploidFSTMat(t(G), locusNames = snpNames, popNames = popNames)## Calculating FSTs, may take a few minutes...
## LocusName He FST T1 T2 FSTNoCorr
## 1 AX-583034838 0.2777778 0.052771937 0.007541076 0.1428994 0.07847402
## 2 AX-583037866 0.4304797 0.138129142 0.031681255 0.2293597 0.16400883
## 3 AX-583038481 0.2777778 0.058334618 0.008344722 0.1430492 0.08171790
## 4 AX-583043989 0.4240161 -0.009452584 -0.002013646 0.2130260 0.01731117
## 5 AX-583044062 0.4998611 0.166640378 0.045020527 0.2701658 0.20345029
## 6 AX-583045992 0.4911111 -0.020229506 -0.004977314 0.2460423 0.01092545
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 0.011234174 0.1431579 0.1666667
## 2 0.037697228 0.2298488 0.3135593
## 3 0.011705660 0.1432447 0.8333333
## 4 0.003694230 0.2134015 0.6949153
## 5 0.055108982 0.2708720 0.5083333
## 6 0.002693991 0.2465793 0.4333333
Check the FST distribution
# Create the histogram of FST values
ggplot(my_fst, aes(x=FST)) +
geom_histogram(binwidth=0.001, fill="blue", alpha=0.7) +
labs(title="Histogram of FST Values",
x="FST",
y="Frequency") +
theme_minimal()We can select the SNPs with low FST to use as neutral SNPs
my_fst |>
filter(FST < 0.2 & FST >=-0.1) |>
dplyr::select(LocusName) -> neutral_snps
length(neutral_snps$LocusName)## [1] 3731
write.table(
neutral_snps$LocusName,
file = here("output", "outflank","neutral_SNPs.txt"),
row.names = FALSE,
quote = FALSE,
col.names = FALSE,
sep = "\n"
)Create a vcf
plink2 \
--bfile output/outflank/intergenic \
--export vcf \
--extract output/outflank/neutral_SNPs.txt \
--out output/outflank/neutral \
--silent;
grep "samples\|variants" output/outflank/neutral.log ## 60 samples (24 females, 26 males, 10 ambiguous; 60 founders) loaded from
## 4488 variants loaded from output/outflank/intergenic.bim.
## --extract: 3731 variants remaining.
## 3731 variants remaining after main filters.
Import the data.
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 3731
## column count: 69
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 3731
## Character matrix gt cols: 69
## skip: 0
## nrows: 3731
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant: 3731
## All variants processed
# vcf <- read.vcfR(here("output","outflank","intergenic.vcf"))
# Extract genotypes
geno <- vcfR::extract.gt(vcf)
# Extract locus names (positions)
locusNames <- vcfR::getPOS(vcf)
# Samples
sampleNames <- colnames(geno)
# Extract population names by splitting the sample names
# If your sample names contain population information.
popNames <- sapply(strsplit(sampleNames, "_"), "[", 1)
# Initialize new matrix for transformed genotypes
G <- matrix(NA, nrow = nrow(geno), ncol = ncol(geno))
# Recode genotypes
G[geno %in% c("0/0", "0|0")] <- 0
G[geno %in% c("0/1", "1/0", "1|0", "0|1")] <- 1
G[geno %in% c("1/1", "1|1")] <- 2
# Replace NA with 9 for missing data
G <- replace(G, is.na(G), 9)
# Check the counts of each genotype
table(as.vector(G))##
## 0 1 2 9
## 89218 78306 51396 4940
# Extract SNP names
snpNames <- vcfR::getID(vcf)
# Calculate FST with SNP names
my_fst <- MakeDiploidFSTMat(t(G), locusNames = snpNames, popNames = popNames)## Calculating FSTs, may take a few minutes...
## LocusName He FST T1 T2 FSTNoCorr
## 1 AX-583034838 0.2777778 0.052771937 0.007541076 0.1428994 0.07847402
## 2 AX-583037866 0.4304797 0.138129142 0.031681255 0.2293597 0.16400883
## 3 AX-583038481 0.2777778 0.058334618 0.008344722 0.1430492 0.08171790
## 4 AX-583043989 0.4240161 -0.009452584 -0.002013646 0.2130260 0.01731117
## 5 AX-583044062 0.4998611 0.166640378 0.045020527 0.2701658 0.20345029
## 6 AX-583045992 0.4911111 -0.020229506 -0.004977314 0.2460423 0.01092545
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 0.011234174 0.1431579 0.1666667
## 2 0.037697228 0.2298488 0.3135593
## 3 0.011705660 0.1432447 0.8333333
## 4 0.003694230 0.2134015 0.6949153
## 5 0.055108982 0.2708720 0.5083333
## 6 0.002693991 0.2465793 0.4333333
Check the FST distribution
# Create the histogram of FST values
ggplot(my_fst, aes(x=FST)) +
geom_histogram(binwidth=0.001, fill="blue", alpha=0.7) +
labs(title="Histogram of FST Values",
x="FST",
y="Frequency") +
theme_minimal()Data prep: decide which SNPs to use for calibrating the null distribution of Fst
# Run OutFLANK to estimate the neutral FST distribution
out_trim <- OutFLANK(FstDataFrame = my_fst, Hmin = 0.1, NumberOfSamples = 60, qthreshold = 0.05)
str(out_trim)## List of 6
## $ FSTbar : num 0.0516
## $ FSTNoCorrbar : num 0.0809
## $ dfInferred : num 2.48
## $ numberLowFstOutliers : int 0
## $ numberHighFstOutliers: int 0
## $ results :'data.frame': 3731 obs. of 15 variables:
## ..$ LocusName : chr [1:3731] "AX-583034838" "AX-583037866" "AX-583038481" "AX-583043989" ...
## ..$ He : num [1:3731] 0.278 0.43 0.278 0.424 0.5 ...
## ..$ FST : num [1:3731] 0.05277 0.13813 0.05833 -0.00945 0.16664 ...
## ..$ T1 : num [1:3731] 0.00754 0.03168 0.00834 -0.00201 0.04502 ...
## ..$ T2 : num [1:3731] 0.143 0.229 0.143 0.213 0.27 ...
## ..$ FSTNoCorr : num [1:3731] 0.0785 0.164 0.0817 0.0173 0.2035 ...
## ..$ T1NoCorr : num [1:3731] 0.01123 0.0377 0.01171 0.00369 0.05511 ...
## ..$ T2NoCorr : num [1:3731] 0.143 0.23 0.143 0.213 0.271 ...
## ..$ meanAlleleFreq : num [1:3731] 0.167 0.314 0.833 0.695 0.508 ...
## ..$ indexOrder : int [1:3731] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ GoodH : chr [1:3731] "goodH" "goodH" "goodH" "goodH" ...
## ..$ qvalues : num [1:3731] 0.83 0.772 0.822 0.926 0.772 ...
## ..$ pvalues : num [1:3731] 0.787 0.239 0.753 0.296 0.136 ...
## ..$ pvaluesRightTail: num [1:3731] 0.3935 0.1196 0.3767 0.8518 0.0679 ...
## ..$ OutlierFlag : logi [1:3731] FALSE FALSE FALSE FALSE FALSE FALSE ...
Check the fit and make sure it looks good, especially in the right tail:
OutFLANKResultsPlotter(
out_trim,
withOutliers = TRUE,
NoCorr = TRUE,
Hmin = 0.1,
binwidth = 0.001,
Zoom = FALSE,
RightZoomFraction = 0.05,
titletext = NULL
)## Zoom in on right tail
OutFLANKResultsPlotter(
out_trim ,
withOutliers = TRUE,
NoCorr = TRUE,
Hmin = 0.1,
binwidth = 0.001,
Zoom =
TRUE,
RightZoomFraction = 0.05,
titletext = NULL
)Also check the P-value histogram: Here, we plot the “right-tailed” P-values, which means that outliers in the right tail of the FST distribution will have a P-value near zero. Because we ran the algorithm on a trimmed set of SNPs, this will remove some of the signal around selected sites. So we expect this histogram to be flat and maybe have a bump near 0 for selected sites. This histogram looks pretty good.
Now that we’ve estimated neutral mean FST and df to a quasi-independent set of SNPs, we can go and calculate P-values and q-values for all the loci in our dataset before prunning.
Note that it is important to run this code with the uncorrected FSTs (FSTNoCorr) and the uncorrected mean FST (FSTNoCorrbar).
Now we can use the entire data set to estimate the fst
Create a vcf - ld pruned data
plink2 \
--bfile output/quality_control/file7 \
--export vcf \
--extract output/quality_control/indepSNP.prune.in \
--make-bed \
--out output/outflank/autogenous \
--silent;
grep "samples\|variants" output/outflank/autogenous.log ## 60 samples (24 females, 26 males, 10 ambiguous; 60 founders) loaded from
## 110353 variants loaded from output/quality_control/file7.bim.
## --extract: 41620 variants remaining.
## 41620 variants remaining after main filters.
Import the data
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 41620
## column count: 69
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 41620
## Character matrix gt cols: 69
## skip: 0
## nrows: 41620
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant 38000
Processed variant 39000
Processed variant 40000
Processed variant 41000
Processed variant: 41620
## All variants processed
# Extract genotypes
geno2 <- vcfR::extract.gt(vcf2)
# Extract locus names (positions)
locusNames2 <- vcfR::getPOS(vcf2)
# Samples
sampleNames2 <- colnames(geno2)
# Extract population names by splitting the sample names
# If your sample names contain population information.
popNames2 <- sapply(strsplit(sampleNames2, "_"), "[", 1)
# Initialize new matrix for transformed genotypes
H <- matrix(NA, nrow = nrow(geno2), ncol = ncol(geno2))
# Recode genotypes
H[geno2 %in% c("0/0", "0|0")] <- 0
H[geno2 %in% c("0/1", "1/0", "1|0", "0|1")] <- 1
H[geno2 %in% c("1/1", "1|1")] <- 2
# Replace NA with 9 for missing data
H <- replace(H, is.na(H), 9)
# Check the counts of each genotype
table(as.vector(H))##
## 0 1 2 9
## 1192622 654780 580281 69517
# Extract SNP names
snpNames <- vcfR::getID(vcf2)
# Use the option below for names if you want to make the plot without having to add the postion later on
locusNames2 <- vcfR::getPOS(vcf2)
# Calculate FST
my_fst2 <- MakeDiploidFSTMat(t(H), locusNames = locusNames2, popNames = popNames2)## Calculating FSTs, may take a few minutes...
## [1] "10000 done of 41620"
## [1] "20000 done of 41620"
## [1] "30000 done of 41620"
## [1] "40000 done of 41620"
my_fst3 <- MakeDiploidFSTMat(t(H), locusNames = snpNames, popNames = popNames2) # with SNP id instead of position## Calculating FSTs, may take a few minutes...
## [1] "10000 done of 41620"
## [1] "20000 done of 41620"
## [1] "30000 done of 41620"
## [1] "40000 done of 41620"
# Concatenate the SNP names and the SNP positions
locusNames3 <- paste(snpNames, locusNames2, sep = ".")
# Calculate FST with the combined names
my_fst4 <- MakeDiploidFSTMat(t(H), locusNames = locusNames3, popNames = popNames2)## Calculating FSTs, may take a few minutes...
## [1] "10000 done of 41620"
## [1] "20000 done of 41620"
## [1] "30000 done of 41620"
## [1] "40000 done of 41620"
## LocusName He FST T1 T2 FSTNoCorr
## 1 97856 0.21224732 0.174406212 0.0199494949 0.1143852 0.19086232
## 2 305518 0.05305785 -0.020578060 -0.0005465327 0.0265590 0.00977961
## 3 308124 0.40035077 0.020281534 0.0041262477 0.2034485 0.04734565
## 4 315059 0.24319444 -0.001759818 -0.0002153631 0.1223781 0.02094621
## 5 315386 0.47277778 0.031058420 0.0074908526 0.2411859 0.05704139
## 6 330908 0.49020000 -0.027900918 -0.0068662477 0.2460940 0.01822573
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 0.0218599034 0.1145323 0.8793103
## 2 0.0002604437 0.0266313 0.9727273
## 3 0.0096487441 0.2037937 0.7232143
## 4 0.0025674370 0.1225728 0.8583333
## 5 0.0137827003 0.2416263 0.6166667
## 6 0.0044977511 0.2467804 0.4300000
## LocusName He FST T1 T2 FSTNoCorr
## 1 AX-581444870 0.21224732 0.174406212 0.0199494949 0.1143852 0.19086232
## 2 AX-583035083 0.05305785 -0.020578060 -0.0005465327 0.0265590 0.00977961
## 3 AX-583035102 0.40035077 0.020281534 0.0041262477 0.2034485 0.04734565
## 4 AX-583033342 0.24319444 -0.001759818 -0.0002153631 0.1223781 0.02094621
## 5 AX-583035163 0.47277778 0.031058420 0.0074908526 0.2411859 0.05704139
## 6 AX-583035198 0.49020000 -0.027900918 -0.0068662477 0.2460940 0.01822573
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 0.0218599034 0.1145323 0.8793103
## 2 0.0002604437 0.0266313 0.9727273
## 3 0.0096487441 0.2037937 0.7232143
## 4 0.0025674370 0.1225728 0.8583333
## 5 0.0137827003 0.2416263 0.6166667
## 6 0.0044977511 0.2467804 0.4300000
## LocusName He FST T1 T2
## 1 AX-581444870.97856 0.21224732 0.174406212 0.0199494949 0.1143852
## 2 AX-583035083.305518 0.05305785 -0.020578060 -0.0005465327 0.0265590
## 3 AX-583035102.308124 0.40035077 0.020281534 0.0041262477 0.2034485
## 4 AX-583033342.315059 0.24319444 -0.001759818 -0.0002153631 0.1223781
## 5 AX-583035163.315386 0.47277778 0.031058420 0.0074908526 0.2411859
## 6 AX-583035198.330908 0.49020000 -0.027900918 -0.0068662477 0.2460940
## FSTNoCorr T1NoCorr T2NoCorr meanAlleleFreq
## 1 0.19086232 0.0218599034 0.1145323 0.8793103
## 2 0.00977961 0.0002604437 0.0266313 0.9727273
## 3 0.04734565 0.0096487441 0.2037937 0.7232143
## 4 0.02094621 0.0025674370 0.1225728 0.8583333
## 5 0.05704139 0.0137827003 0.2416263 0.6166667
## 6 0.01822573 0.0044977511 0.2467804 0.4300000
Data checks: Heterozygosity vs. FST Here, you can see how some of the low H loci have high FST. These are all neutral loci in the simulation, and it is important to exclude them from the OutFLANK algorithm.
Note that it is important to run this code with the uncorrected FSTs (FSTNoCorr) and the uncorrected mean FST (FSTNoCorrbar).
P1 <-
pOutlierFinderChiSqNoCorr(
my_fst4,
Fstbar = out_trim$FSTNoCorrbar,
dfInferred = out_trim$dfInferred,
qthreshold = 0.05,
Hmin = 0.1
)
head(P1)## LocusName He FST T1 T2 FSTNoCorr
## 1 AX-581444870.97856 0.2122473 0.174406212 0.0199494949 0.1143852 0.19086232
## 3 AX-583035102.308124 0.4003508 0.020281534 0.0041262477 0.2034485 0.04734565
## 4 AX-583033342.315059 0.2431944 -0.001759818 -0.0002153631 0.1223781 0.02094621
## 5 AX-583035163.315386 0.4727778 0.031058420 0.0074908526 0.2411859 0.05704139
## 6 AX-583035198.330908 0.4902000 -0.027900918 -0.0068662477 0.2460940 0.01822573
## 7 AX-583035257.442875 0.4800000 0.243941554 0.0649819196 0.2663831 0.26402886
## T1NoCorr T2NoCorr meanAlleleFreq pvalues pvaluesRightTail qvalues
## 1 0.021859903 0.1145323 0.8793103 0.16283854 0.08141927 0.2521709
## 3 0.009648744 0.2037937 0.7232143 0.81471287 0.59264357 0.5966483
## 4 0.002567437 0.1225728 0.8583333 0.36463355 0.81768322 0.6812806
## 5 0.013782700 0.2416263 0.6166667 0.95413500 0.52293250 0.5646682
## 6 0.004497751 0.2467804 0.4300000 0.31369326 0.84315337 0.6888662
## 7 0.070433599 0.2667648 0.4000000 0.05639507 0.02819754 0.1595275
## OutlierFlag
## 1 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## 7 FALSE
## LocusName He FST T1 T2
## 41531 AX-583028201.487250892 0.09500000 0.225057093 0.0117880846 0.05237820
## 41554 AX-583030779.487439328 0.03702385 0.007209959 0.0001350837 0.01873571
## 41573 AX-583032255.487617812 0.03332376 0.020057778 0.0003395994 0.01693106
## 41578 AX-583032695.487694040 0.06444444 0.025512528 0.0008403217 0.03293761
## 41615 AX-583032854.488822384 0.05038644 0.023626087 0.0006092546 0.02578737
## 41620 AX-583034980.489339661 0.09809750 0.047236453 0.0023857904 0.05050740
## FSTNoCorr T1NoCorr T2NoCorr meanAlleleFreq pvalues
## 41531 0.24918491 0.0130742913 0.05246823 0.0500000 NA
## 41554 0.03674133 0.0006896880 0.01877145 0.9811321 NA
## 41573 0.04649132 0.0007888453 0.01696758 0.9830508 NA
## 41578 0.06412786 0.0021179537 0.03302704 0.9666667 NA
## 41615 0.06873395 0.0017786561 0.02587740 0.9741379 NA
## 41620 0.08146709 0.0041263671 0.05065072 0.9482759 NA
## pvaluesRightTail qvalues OutlierFlag
## 41531 NA NA NA
## 41554 NA NA NA
## 41573 NA NA NA
## 41578 NA NA NA
## 41615 NA NA NA
## 41620 NA NA NA
# notice how the output is ordered differently
my_out <- P1$OutlierFlag==TRUE
plot(P1$He, P1$FST, pch=19, col=rgb(0,0,0,0.1))
points(P1$He[my_out], P1$FST[my_out], col="blue")# check the P-value histogram for the full set of data
# if there are outliers, it should look flat with an inflation near 0Highlight outliers on Manhattan Plot
For publication, we want to show the accurate estimate of FST, not the uncorrected estimate. Remember to exclude those low H loci!
# Use the separate() function to split the LocusName column
P1 <- P1 |>
separate(LocusName, into = c("SNP_id", "LocusName"), sep = "\\.")
# Create plot
plot(P1$LocusName[P1$He>0.1], P1$FST[P1$He>0.1],
xlab="Position", ylab="FST", col=rgb(0,0,0,0.2))
points(P1$LocusName[my_out], P1$FST[my_out], col="magenta", pch=20)
output/local_adaptation/file3 We can use our bim file to create a plot
with the chromosomal ids
# ____________________________________________________________________________
# import the bim file with the SNP data ####
snps <-
read_delim( # to learn about the options use here, run ?read_delim on the console.
here(
"output", "outflank", "autogenous.bim"
), # use library here to load it
col_names = FALSE, # we don't have header in the input file
show_col_types = FALSE, # suppress message from read_delim
col_types = "ccidcc" # set the class of each column
)
#
# set column names
colnames(
snps
) <- # to add a header in our tibble
c(
"Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
)
#
# check the tibble
head(snps)## # A tibble: 6 × 6
## Scaffold SNP Cm Position Allele1 Allele2
## <chr> <chr> <int> <dbl> <chr> <chr>
## 1 1 AX-581444870 0 97856 C T
## 2 1 AX-583035083 0 305518 A G
## 3 1 AX-583035102 0 308124 A G
## 4 1 AX-583033342 0 315059 C G
## 5 1 AX-583035163 0 315386 A G
## 6 1 AX-583035198 0 330908 G T
Now we can merge it with my_fst3
# Use the separate() function to split the LocusName column
my_fst4 <- my_fst4 |>
separate(LocusName, into = c("SNP_id", "LocusName"), sep = "\\.")
# Merge the data frames
merged_df <- merge(my_fst4, snps, by.x = "SNP_id", by.y = "SNP")
# Rename the column
merged_df <- rename(merged_df, Chromosome = Scaffold)
# View the first few rows of the merged data frame
head(merged_df)## SNP_id LocusName He FST T1 T2 FSTNoCorr
## 1 AX-579436298 359893669 0.4987500 0.05206533 0.013369124 0.25677594 0.08105385
## 2 AX-579436317 359894534 0.1827061 0.07873031 0.007474384 0.09493655 0.10440289
## 3 AX-579436673 359886150 0.4676817 0.06359726 0.015403396 0.24220218 0.09720774
## 4 AX-579436698 359886370 0.1827061 0.05728006 0.005428505 0.09477129 0.10340749
## 5 AX-579436724 359914399 0.4831944 0.08175794 0.020540339 0.25123355 0.10509477
## 6 AX-579436772 359886810 0.0665874 0.06657702 0.002290796 0.03440821 0.09028018
## T1NoCorr T2NoCorr meanAlleleFreq Chromosome Cm Position Allele1 Allele2
## 1 0.020855153 0.25729996 0.5250000 2 0 359893669 C T
## 2 0.009928506 0.09509800 0.1016949 2 0 359894534 G T
## 3 0.023596318 0.24274116 0.3728814 2 0 359886150 T C
## 4 0.009830004 0.09506084 0.8983051 2 0 359886370 A T
## 5 0.026446784 0.25164700 0.5916667 2 0 359914399 G T
## 6 0.003110957 0.03445892 0.9655172 2 0 359886810 T C
Use ggplot to make a new plot
# Subset dataframe for He > 0.1
df_sub <- subset(merged_df, He > 0.1)
# Adjust my_out condition to fit merged_df
my_out <- P1$SNP_id[P1$OutlierFlag == TRUE]
# Save it
write.table(
my_out,
file = here("output", "outflank","SNPs_outFlank.txt"),
row.names = FALSE,
quote = FALSE,
col.names = FALSE,
sep = "\n"
)
# Create a new column in the dataframe to define highlight points based on my_out
df_sub$highlight <- ifelse(df_sub$SNP_id %in% my_out, TRUE, FALSE)
# source the plotting function
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
# Custom label function
k_format <- function(x) {
paste0(format(x / 1e6, big.mark = "", scientific = FALSE), "k")
}
# Set colors for the chromosomes
color_vector <- c("#CCF6D6", "#F6E1CC", "#CCD8F6") # Add or remove colors as needed.
names(color_vector) <- unique(df_sub$Chromosome) # Make sure unique(df_sub$Chromosome) gives all unique Chromosome values.
# Filter data to include only the highest FST value for each chromosome
highest_FST <- df_sub[which(df_sub$highlight == TRUE), ]
highest_FST <- highest_FST[which(highest_FST$FST == ave(highest_FST$FST, highest_FST$Chromosome, FUN = max)), ]
# Create the plot
ggplot(df_sub, aes(x = Position, y = FST)) +
geom_point(aes(color = Chromosome),
data = subset(df_sub, highlight == FALSE),
size = .3) +
geom_point(
color = "magenta",
data = subset(df_sub, highlight == TRUE),
size = .3
) +
scale_color_manual(values = color_vector, guide = "none") +
labs(x = "Position", y = "FST", caption = "") +
facet_wrap(~ Chromosome, scales = "free_x") +
scale_x_continuous(labels = k_format) +
my_theme() +
theme(panel.spacing = unit(.2, "lines"),
plot.margin = unit(c(1, 1, 2, 2), "lines"),
plot.caption = element_markdown(face = "italic", color = "#574E4E")) +
# Annotate only the highest highlighted point for each chromosome
geom_text_repel(
data = highest_FST,
aes(label = SNP_id),
size = 3,
nudge_y = 0.01,
segment.color = NA,
max.overlaps = Inf # Set to 'Inf' to show all annotations
)This tool does not directly account for population structure. It instead looks for loci that show a higher Fst than the neutral expectation, under a model of hierarchical population structure. The method is robust to hierarchical population structure and a range of demographic histories.
Check the documentation https://htmlpreview.github.io/?https://github.com/whitlock/OutFLANK/blob/master/inst/doc/OutFLANKAnalysis.html
“OutFLANK is an R package that implements the method developed by Whitlock and Lotterhos (2015) to use likelihood on a trimmed distribution of FST values to infer the distribution of FST for neutral markers. This distribution is then used to assign q-values to each locus to detect outliers that may be due to spatially heterogeneous selection.”
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 3003362 160.4 4700540 251.1 NA 4700540 251.1
## Vcells 6422180 49.0 32928860 251.3 32768 41160966 314.1
Create vcf with intergenic SNPs that are not linked
plink2 \
--bfile output/outflank/man_aut \
--extract output/outflank/indepSNP.prune.in \
--export vcf \
--make-bed \
--out output/outflank/man_aut2 \
--silent;
grep 'samples\|variants\|remaining' output/outflank/man_aut2.log## 38 samples (13 females, 15 males, 10 ambiguous; 38 founders) loaded from
## 41100 variants loaded from output/outflank/man_aut.bim.
## --extract: 3638 variants remaining.
## 3638 variants remaining after main filters.
We need to import the data
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 3638
## column count: 47
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 3638
## Character matrix gt cols: 47
## skip: 0
## nrows: 3638
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant: 3638
## All variants processed
# Extract genotypes
geno <- vcfR::extract.gt(vcf)
# Extract locus names (positions)
locusNames <- vcfR::getPOS(vcf)
# Samples
sampleNames <- colnames(geno)
# Extract population names by splitting the sample names
# If your sample names contain population information.
popNames <- sapply(strsplit(sampleNames, "_"), "[", 1)
# Initialize new matrix for transformed genotypes
G <- matrix(NA, nrow = nrow(geno), ncol = ncol(geno))
# Recode genotypes
G[geno %in% c("0/0", "0|0")] <- 0
G[geno %in% c("0/1", "1/0", "1|0", "0|1")] <- 1
G[geno %in% c("1/1", "1|1")] <- 2
# Replace NA with 9 for missing data
G <- replace(G, is.na(G), 9)
# Check the counts of each genotype
table(as.vector(G))##
## 0 1 2 9
## 62350 41594 31320 2980
# Extract SNP names
snpNames <- vcfR::getID(vcf)
# Calculate FST with SNP names
my_fst <- MakeDiploidFSTMat(t(G), locusNames = snpNames, popNames = popNames)## Calculating FSTs, may take a few minutes...
## LocusName He FST T1 T2 FSTNoCorr
## 1 AX-583034838 0.18836565 -0.011429743 -0.001081881 0.09465486 0.01933784
## 2 AX-583038083 0.05124654 0.148905003 0.004246032 0.02851504 0.17431193
## 3 AX-583043989 0.41782323 0.008652319 0.001841270 0.21280652 0.04147982
## 4 AX-583044062 0.45810249 -0.015858927 -0.003670281 0.23143310 0.03530505
## 5 AX-583045992 0.48753463 -0.016407787 -0.004021400 0.24509096 0.02321162
## 6 AX-583046358 0.49382716 0.060951495 0.015855725 0.26013677 0.09411765
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 0.001843112 0.09531116 0.1052632
## 2 0.005000000 0.02868421 0.9736842
## 3 0.008888889 0.21429429 0.7027027
## 4 0.008265306 0.23411117 0.6447368
## 5 0.005739796 0.24728115 0.4210526
## 6 0.024691358 0.26234568 0.5555556
Check the FST distribution
# Create the histogram of FST values
ggplot(my_fst, aes(x=FST)) +
geom_histogram(binwidth=0.001, fill="blue", alpha=0.7) +
labs(title="Histogram of FST Values",
x="FST",
y="Frequency") +
theme_minimal()## Warning: Removed 67 rows containing non-finite values (`stat_bin()`).
We can select the SNPs with low FST to use as neutral SNPs
my_fst |>
filter(FST < 0.2 & FST >=-0.1) |>
dplyr::select(LocusName) -> neutral_snps
length(neutral_snps$LocusName)## [1] 2719
write.table(
neutral_snps$LocusName,
file = here("output", "outflank","man_aut2_SNPs.txt"),
row.names = FALSE,
quote = FALSE,
col.names = FALSE,
sep = "\n"
)Create a vcf
plink2 \
--bfile output/outflank/man_aut2 \
--export vcf \
--extract output/outflank/man_aut2_SNPs.txt \
--out output/outflank/man_aut3 \
--silent;
grep "samples\|variants" output/outflank/man_aut3.log ## 38 samples (13 females, 15 males, 10 ambiguous; 38 founders) loaded from
## 3638 variants loaded from output/outflank/man_aut2.bim.
## --extract: 2719 variants remaining.
## 2719 variants remaining after main filters.
Import the data.
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 2719
## column count: 47
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 2719
## Character matrix gt cols: 47
## skip: 0
## nrows: 2719
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant: 2719
## All variants processed
# vcf <- read.vcfR(here("output","outflank","intergenic.vcf"))
# Extract genotypes
geno <- vcfR::extract.gt(vcf)
# Extract locus names (positions)
locusNames <- vcfR::getPOS(vcf)
# Samples
sampleNames <- colnames(geno)
# Extract population names by splitting the sample names
# If your sample names contain population information.
popNames <- sapply(strsplit(sampleNames, "_"), "[", 1)
# Initialize new matrix for transformed genotypes
G <- matrix(NA, nrow = nrow(geno), ncol = ncol(geno))
# Recode genotypes
G[geno %in% c("0/0", "0|0")] <- 0
G[geno %in% c("0/1", "1/0", "1|0", "0|1")] <- 1
G[geno %in% c("1/1", "1|1")] <- 2
# Replace NA with 9 for missing data
G <- replace(G, is.na(G), 9)
# Check the counts of each genotype
table(as.vector(G))##
## 0 1 2 9
## 44510 34392 22166 2254
# Extract SNP names
snpNames <- vcfR::getID(vcf)
# Calculate FST with SNP names
my_fst <- MakeDiploidFSTMat(t(G), locusNames = snpNames, popNames = popNames)## Calculating FSTs, may take a few minutes...
## LocusName He FST T1 T2 FSTNoCorr
## 1 AX-583034838 0.18836565 -0.011429743 -0.001081881 0.09465486 0.01933784
## 2 AX-583038083 0.05124654 0.148905003 0.004246032 0.02851504 0.17431193
## 3 AX-583043989 0.41782323 0.008652319 0.001841270 0.21280652 0.04147982
## 4 AX-583044062 0.45810249 -0.015858927 -0.003670281 0.23143310 0.03530505
## 5 AX-583045992 0.48753463 -0.016407787 -0.004021400 0.24509096 0.02321162
## 6 AX-583046358 0.49382716 0.060951495 0.015855725 0.26013677 0.09411765
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 0.001843112 0.09531116 0.1052632
## 2 0.005000000 0.02868421 0.9736842
## 3 0.008888889 0.21429429 0.7027027
## 4 0.008265306 0.23411117 0.6447368
## 5 0.005739796 0.24728115 0.4210526
## 6 0.024691358 0.26234568 0.5555556
Check the FST distribution
# Create the histogram of FST values
ggplot(my_fst, aes(x=FST)) +
geom_histogram(binwidth=0.001, fill="blue", alpha=0.7) +
labs(title="Histogram of FST Values",
x="FST",
y="Frequency") +
theme_minimal()Data prep: decide which SNPs to use for calibrating the null distribution of Fst
# Run OutFLANK to estimate the neutral FST distribution
out_trim <- OutFLANK(FstDataFrame = my_fst, Hmin = 0.1, NumberOfSamples = 60, qthreshold = 0.05)
str(out_trim)## List of 6
## $ FSTbar : num 0.0326
## $ FSTNoCorrbar : num 0.0691
## $ dfInferred : num 2
## $ numberLowFstOutliers : int 0
## $ numberHighFstOutliers: int 0
## $ results :'data.frame': 2719 obs. of 15 variables:
## ..$ LocusName : chr [1:2719] "AX-583034838" "AX-583038083" "AX-583043989" "AX-583044062" ...
## ..$ He : num [1:2719] 0.1884 0.0512 0.4178 0.4581 0.4875 ...
## ..$ FST : num [1:2719] -0.01143 0.14891 0.00865 -0.01586 -0.01641 ...
## ..$ T1 : num [1:2719] -0.00108 0.00425 0.00184 -0.00367 -0.00402 ...
## ..$ T2 : num [1:2719] 0.0947 0.0285 0.2128 0.2314 0.2451 ...
## ..$ FSTNoCorr : num [1:2719] 0.0193 0.1743 0.0415 0.0353 0.0232 ...
## ..$ T1NoCorr : num [1:2719] 0.00184 0.005 0.00889 0.00827 0.00574 ...
## ..$ T2NoCorr : num [1:2719] 0.0953 0.0287 0.2143 0.2341 0.2473 ...
## ..$ meanAlleleFreq : num [1:2719] 0.105 0.974 0.703 0.645 0.421 ...
## ..$ indexOrder : int [1:2719] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ GoodH : chr [1:2719] "goodH" "lowH" "goodH" "goodH" ...
## ..$ qvalues : num [1:2719] 0.954 NA 0.954 0.954 0.954 ...
## ..$ pvalues : num [1:2719] 0.488 NA 0.902 0.8 0.57 ...
## ..$ pvaluesRightTail: num [1:2719] 0.756 NA 0.549 0.6 0.715 ...
## ..$ OutlierFlag : logi [1:2719] FALSE FALSE FALSE FALSE FALSE FALSE ...
Check the fit and make sure it looks good, especially in the right tail:
OutFLANKResultsPlotter(
out_trim,
withOutliers = TRUE,
NoCorr = TRUE,
Hmin = 0.1,
binwidth = 0.001,
Zoom = FALSE,
RightZoomFraction = 0.05,
titletext = NULL
)## Zoom in on right tail
OutFLANKResultsPlotter(
out_trim ,
withOutliers = TRUE,
NoCorr = TRUE,
Hmin = 0.1,
binwidth = 0.001,
Zoom =
TRUE,
RightZoomFraction = 0.05,
titletext = NULL
)Also check the P-value histogram: Here, we plot the “right-tailed” P-values, which means that outliers in the right tail of the FST distribution will have a P-value near zero. Because we ran the algorithm on a trimmed set of SNPs, this will remove some of the signal around selected sites. So we expect this histogram to be flat and maybe have a bump near 0 for selected sites. This histogram looks pretty good.
Now that we’ve estimated neutral mean FST and df to a quasi-independent set of SNPs, we can go and calculate P-values and q-values for all the loci in our dataset before prunning.
Note that it is important to run this code with the uncorrected FSTs (FSTNoCorr) and the uncorrected mean FST (FSTNoCorrbar).
Now we can use the entire data set to estimate the fst
Create a vcf - ld pruned data
plink2 \
--bfile output/outflank/man_aut \
--export vcf \
--make-bed \
--out output/outflank/man_aut4 \
--silent;
grep "samples\|variants" output/outflank/man_aut4.log ## 38 samples (13 females, 15 males, 10 ambiguous; 38 founders) loaded from
## 41100 variants loaded from output/outflank/man_aut.bim.
Import the data
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 41100
## column count: 47
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 41100
## Character matrix gt cols: 47
## skip: 0
## nrows: 41100
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant 38000
Processed variant 39000
Processed variant 40000
Processed variant 41000
Processed variant: 41100
## All variants processed
# Extract genotypes
geno2 <- vcfR::extract.gt(vcf2)
# Extract locus names (positions)
locusNames2 <- vcfR::getPOS(vcf2)
# Samples
sampleNames2 <- colnames(geno2)
# Extract population names by splitting the sample names
# If your sample names contain population information.
popNames2 <- sapply(strsplit(sampleNames2, "_"), "[", 1)
# Initialize new matrix for transformed genotypes
H <- matrix(NA, nrow = nrow(geno2), ncol = ncol(geno2))
# Recode genotypes
H[geno2 %in% c("0/0", "0|0")] <- 0
H[geno2 %in% c("0/1", "1/0", "1|0", "0|1")] <- 1
H[geno2 %in% c("1/1", "1|1")] <- 2
# Replace NA with 9 for missing data
H <- replace(H, is.na(H), 9)
# Check the counts of each genotype
table(as.vector(H))##
## 0 1 2 9
## 746354 409679 366728 39039
# Extract SNP names
snpNames <- vcfR::getID(vcf2)
# Use the option below for names if you want to make the plot without having to add the postion later on
locusNames2 <- vcfR::getPOS(vcf2)
# Calculate FST
my_fst2 <- MakeDiploidFSTMat(t(H), locusNames = locusNames2, popNames = popNames2)## Calculating FSTs, may take a few minutes...
## [1] "10000 done of 41100"
## [1] "20000 done of 41100"
## [1] "30000 done of 41100"
## [1] "40000 done of 41100"
my_fst3 <- MakeDiploidFSTMat(t(H), locusNames = snpNames, popNames = popNames2) # with SNP id instead of position## Calculating FSTs, may take a few minutes...
## [1] "10000 done of 41100"
## [1] "20000 done of 41100"
## [1] "30000 done of 41100"
## [1] "40000 done of 41100"
# Concatenate the SNP names and the SNP positions
locusNames3 <- paste(snpNames, locusNames2, sep = ".")
# Calculate FST with the combined names
my_fst4 <- MakeDiploidFSTMat(t(H), locusNames = locusNames3, popNames = popNames2)## Calculating FSTs, may take a few minutes...
## [1] "10000 done of 41100"
## [1] "20000 done of 41100"
## [1] "30000 done of 41100"
## [1] "40000 done of 41100"
## LocusName He FST T1 T2 FSTNoCorr
## 1 97856 0.07986111 0.26478873 0.0127995643 0.04833878 0.285714286
## 2 305518 0.05259313 -0.01270999 -0.0003360868 0.02644273 0.023886378
## 3 308124 0.44444444 -0.02261208 -0.0050121824 0.22165951 0.004779184
## 4 315059 0.28358726 -0.02558249 -0.0036107568 0.14114175 0.002873509
## 5 315386 0.49134349 0.02940734 0.0074629748 0.25377929 0.064840801
## 6 330908 0.49218750 -0.01199750 -0.0029965693 0.24976611 0.041616813
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 0.0138888889 0.04861111 0.9583333
## 2 0.0006377551 0.02669953 0.9729730
## 3 0.0010650888 0.22285996 0.6666667
## 4 0.0004081633 0.14204350 0.8289474
## 5 0.0165880102 0.25582673 0.5657895
## 6 0.0105019954 0.25234982 0.4375000
## LocusName He FST T1 T2 FSTNoCorr
## 1 AX-581444870 0.07986111 0.26478873 0.0127995643 0.04833878 0.285714286
## 2 AX-583035083 0.05259313 -0.01270999 -0.0003360868 0.02644273 0.023886378
## 3 AX-583035102 0.44444444 -0.02261208 -0.0050121824 0.22165951 0.004779184
## 4 AX-583033342 0.28358726 -0.02558249 -0.0036107568 0.14114175 0.002873509
## 5 AX-583035163 0.49134349 0.02940734 0.0074629748 0.25377929 0.064840801
## 6 AX-583035198 0.49218750 -0.01199750 -0.0029965693 0.24976611 0.041616813
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 0.0138888889 0.04861111 0.9583333
## 2 0.0006377551 0.02669953 0.9729730
## 3 0.0010650888 0.22285996 0.6666667
## 4 0.0004081633 0.14204350 0.8289474
## 5 0.0165880102 0.25582673 0.5657895
## 6 0.0105019954 0.25234982 0.4375000
## LocusName He FST T1 T2
## 1 AX-581444870.97856 0.07986111 0.26478873 0.0127995643 0.04833878
## 2 AX-583035083.305518 0.05259313 -0.01270999 -0.0003360868 0.02644273
## 3 AX-583035102.308124 0.44444444 -0.02261208 -0.0050121824 0.22165951
## 4 AX-583033342.315059 0.28358726 -0.02558249 -0.0036107568 0.14114175
## 5 AX-583035163.315386 0.49134349 0.02940734 0.0074629748 0.25377929
## 6 AX-583035198.330908 0.49218750 -0.01199750 -0.0029965693 0.24976611
## FSTNoCorr T1NoCorr T2NoCorr meanAlleleFreq
## 1 0.285714286 0.0138888889 0.04861111 0.9583333
## 2 0.023886378 0.0006377551 0.02669953 0.9729730
## 3 0.004779184 0.0010650888 0.22285996 0.6666667
## 4 0.002873509 0.0004081633 0.14204350 0.8289474
## 5 0.064840801 0.0165880102 0.25582673 0.5657895
## 6 0.041616813 0.0105019954 0.25234982 0.4375000
Data checks: Heterozygosity vs. FST Here, you can see how some of the low H loci have high FST. These are all neutral loci in the simulation, and it is important to exclude them from the OutFLANK algorithm.
Note that it is important to run this code with the uncorrected FSTs (FSTNoCorr) and the uncorrected mean FST (FSTNoCorrbar).
P1 <-
pOutlierFinderChiSqNoCorr(
my_fst4,
Fstbar = out_trim$FSTNoCorrbar,
dfInferred = out_trim$dfInferred,
qthreshold = 0.05,
Hmin = 0.1
)
head(P1)## LocusName He FST T1 T2 FSTNoCorr
## 3 AX-583035102.308124 0.4444444 -0.02261208 -0.005012182 0.2216595 0.004779184
## 4 AX-583033342.315059 0.2835873 -0.02558249 -0.003610757 0.1411417 0.002873509
## 5 AX-583035163.315386 0.4913435 0.02940734 0.007462975 0.2537793 0.064840801
## 6 AX-583035198.330908 0.4921875 -0.01199750 -0.002996569 0.2497661 0.041616813
## 7 AX-583035257.442875 0.4220914 0.37571923 0.104276148 0.2775374 0.398183340
## 8 AX-583033504.492687 0.3681519 -0.01475510 -0.002717813 0.1841948 0.009483836
## T1NoCorr T2NoCorr meanAlleleFreq pvalues pvaluesRightTail qvalues
## 3 0.0010650888 0.2228600 0.6666667 0.13358643 0.933206783 0.71974901
## 4 0.0004081633 0.1420435 0.8289474 0.08142383 0.959288086 0.71974901
## 5 0.0165880102 0.2558267 0.5657895 0.78290644 0.391453219 0.50770500
## 6 0.0105019954 0.2523498 0.4375000 0.90453261 0.547733693 0.61746156
## 7 0.1111224490 0.2790736 0.3026316 0.00630525 0.003152625 0.02058336
## 8 0.0017558299 0.1851392 0.7567568 0.25637146 0.871814271 0.71974901
## OutlierFlag
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## 7 TRUE
## 8 FALSE
## LocusName He FST T1 T2
## 41072 AX-583031456.488074006 0.07779401 0.25024387 0.0117913832 0.04711957
## 41075 AX-583033406.488107583 0.09972299 0.29082700 0.0179265873 0.06164004
## 41085 AX-583032024.488173766 0.02596953 0.06048352 0.0008258929 0.01365484
## 41090 AX-583034399.488580134 NA NA NA NA
## 41095 AX-583032854.488822384 NA NA NA NA
## 41100 AX-583034980.489339661 0.05259313 0.17868002 0.0053571429 0.02998177
## FSTNoCorr T1NoCorr T2NoCorr meanAlleleFreq pvalues pvaluesRightTail
## 41072 0.29133858 0.01388889 0.04767267 0.95945946 NA NA
## 41075 0.32203390 0.02000000 0.06210526 0.05263158 NA NA
## 41085 0.09090909 0.00125000 0.01375000 0.01315789 NA NA
## 41090 NA NA NA NA NA NA
## 41095 NA NA NA NA NA NA
## 41100 0.20441989 0.00617284 0.03019686 0.97297297 NA NA
## qvalues OutlierFlag
## 41072 NA NA
## 41075 NA NA
## 41085 NA NA
## 41090 NA NA
## 41095 NA NA
## 41100 NA NA
# notice how the output is ordered differently
my_out <- P1$OutlierFlag==TRUE
plot(P1$He, P1$FST, pch=19, col=rgb(0,0,0,0.1))
points(P1$He[my_out], P1$FST[my_out], col="blue")# check the P-value histogram for the full set of data
# if there are outliers, it should look flat with an inflation near 0Highlight outliers on Manhattan Plot
For publication, we want to show the accurate estimate of FST, not the uncorrected estimate. Remember to exclude those low H loci!
# Use the separate() function to split the LocusName column
P1 <- P1 |>
separate(LocusName, into = c("SNP_id", "LocusName"), sep = "\\.")
# Create plot
plot(P1$LocusName[P1$He>0.1], P1$FST[P1$He>0.1],
xlab="Position", ylab="FST", col=rgb(0,0,0,0.2))
points(P1$LocusName[my_out], P1$FST[my_out], col="magenta", pch=20)
output/local_adaptation/file3 We can use our bim file to create a plot
with the chromosomal ids
# ____________________________________________________________________________
# import the bim file with the SNP data ####
snps <-
read_delim( # to learn about the options use here, run ?read_delim on the console.
here(
"output", "outflank", "man_aut.bim"
), # use library here to load it
col_names = FALSE, # we don't have header in the input file
show_col_types = FALSE, # suppress message from read_delim
col_types = "ccidcc" # set the class of each column
)
#
# set column names
colnames(
snps
) <- # to add a header in our tibble
c(
"Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
)
#
# check the tibble
head(snps)## # A tibble: 6 × 6
## Scaffold SNP Cm Position Allele1 Allele2
## <chr> <chr> <int> <dbl> <chr> <chr>
## 1 1 AX-581444870 0 97856 C T
## 2 1 AX-583035083 0 305518 A G
## 3 1 AX-583035102 0 308124 A G
## 4 1 AX-583033342 0 315059 C G
## 5 1 AX-583035163 0 315386 A G
## 6 1 AX-583035198 0 330908 G T
Now we can merge it with my_fst3
# Use the separate() function to split the LocusName column
my_fst4 <- my_fst4 |>
separate(LocusName, into = c("SNP_id", "LocusName"), sep = "\\.")
# Merge the data frames
merged_df <- merge(my_fst4, snps, by.x = "SNP_id", by.y = "SNP")
# Rename the column
merged_df <- rename(merged_df, Chromosome = Scaffold)
# View the first few rows of the merged data frame
head(merged_df)## SNP_id LocusName He FST T1 T2
## 1 AX-579436298 359893669 0.4996537 0.15456552 0.043209680 0.2795558
## 2 AX-579436317 359894534 0.2717312 -0.03561166 -0.004804233 0.1349062
## 3 AX-579436673 359886150 0.4868517 0.12328032 0.033058201 0.2681547
## 4 AX-579436698 359886370 0.2337473 0.08069586 0.010149912 0.1257798
## 5 AX-579436724 359914399 0.4321330 -0.00560858 -0.001224490 0.2183244
## 6 AX-579436772 359886810 NA NA NA NA
## FSTNoCorr T1NoCorr T2NoCorr meanAlleleFreq Chromosome Cm Position
## 1 0.18354204 0.0516581633 0.2814514 0.5131579 2 0 359893669
## 2 0.00102162 0.0001388889 0.1359497 0.1621622 2 0 359894534
## 3 0.16625062 0.0450000000 0.2706757 0.4189189 2 0 359886150
## 4 0.13474144 0.0171467764 0.1272569 0.8648649 2 0 359886370
## 5 0.02967591 0.0065306122 0.2200644 0.6842105 2 0 359914399
## 6 NA NA NA NA 2 0 359886810
## Allele1 Allele2
## 1 C T
## 2 G T
## 3 T C
## 4 A T
## 5 G T
## 6 T C
Use ggplot to make a new plot
# Subset dataframe for He > 0.1
df_sub <- subset(merged_df, He > 0.1)
# Adjust my_out condition to fit merged_df
my_out <- P1$SNP_id[P1$OutlierFlag == TRUE]
# Save it
write.table(
my_out,
file = here("output", "outflank","man_aut_SNPs_outFlank.txt"),
row.names = FALSE,
quote = FALSE,
col.names = FALSE,
sep = "\n"
)
# Create a new column in the dataframe to define highlight points based on my_out
df_sub$highlight <- ifelse(df_sub$SNP_id %in% my_out, TRUE, FALSE)
# source the plotting function
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
# Custom label function
k_format <- function(x) {
paste0(format(x / 1e6, big.mark = "", scientific = FALSE), "k")
}
# Set colors for the chromosomes
color_vector <- c("#CCF6D6", "#F6E1CC", "#CCD8F6") # Add or remove colors as needed.
names(color_vector) <- unique(df_sub$Chromosome) # Make sure unique(df_sub$Chromosome) gives all unique Chromosome values.
# Filter data to include only the highest FST value for each chromosome
highest_FST <- df_sub[which(df_sub$highlight == TRUE), ]
highest_FST <- highest_FST[which(highest_FST$FST == ave(highest_FST$FST, highest_FST$Chromosome, FUN = max)), ]
# Create the plot
ggplot(df_sub, aes(x = Position, y = FST)) +
geom_point(aes(color = Chromosome),
data = subset(df_sub, highlight == FALSE),
size = .3) +
geom_point(
color = "magenta",
data = subset(df_sub, highlight == TRUE),
size = .3
) +
scale_color_manual(values = color_vector, guide = "none") +
labs(x = "Position", y = "FST", caption = "") +
facet_wrap(~ Chromosome, scales = "free_x") +
scale_x_continuous(labels = k_format) +
my_theme() +
theme(panel.spacing = unit(.2, "lines"),
plot.margin = unit(c(1, 1, 2, 2), "lines"),
plot.caption = element_markdown(face = "italic", color = "#574E4E")) +
# Annotate only the highest highlighted point for each chromosome
geom_text_repel(
data = highest_FST,
aes(label = SNP_id),
size = 3,
nudge_y = 0.01,
segment.color = NA,
max.overlaps = Inf # Set to 'Inf' to show all annotations
)This tool does not directly account for population structure. It instead looks for loci that show a higher Fst than the neutral expectation, under a model of hierarchical population structure. The method is robust to hierarchical population structure and a range of demographic histories.
Check the documentation https://htmlpreview.github.io/?https://github.com/whitlock/OutFLANK/blob/master/inst/doc/OutFLANKAnalysis.html
“OutFLANK is an R package that implements the method developed by Whitlock and Lotterhos (2015) to use likelihood on a trimmed distribution of FST values to infer the distribution of FST for neutral markers. This distribution is then used to assign q-values to each locus to detect outliers that may be due to spatially heterogeneous selection.”
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 3004736 160.5 4700540 251.1 NA 4700540 251.1
## Vcells 6339451 48.4 26343088 201.0 32768 41160966 314.1
Create vcf with intergenic SNPs that are not linked
plink2 \
--bfile output/outflank/new_aut \
--extract output/outflank/indepSNP.prune.in \
--export vcf \
--make-bed \
--out output/outflank/new_aut2 \
--silent;
grep 'samples\|variants\|remaining' output/outflank/new_aut2.log## 50 samples (24 females, 26 males; 50 founders) loaded from
## 41468 variants loaded from output/outflank/new_aut.bim.
## --extract: 3656 variants remaining.
## 3656 variants remaining after main filters.
We need to import the data
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 3656
## column count: 59
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 3656
## Character matrix gt cols: 59
## skip: 0
## nrows: 3656
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant: 3656
## All variants processed
# Extract genotypes
geno <- vcfR::extract.gt(vcf)
# Extract locus names (positions)
locusNames <- vcfR::getPOS(vcf)
# Samples
sampleNames <- colnames(geno)
# Extract population names by splitting the sample names
# If your sample names contain population information.
popNames <- sapply(strsplit(sampleNames, "_"), "[", 1)
# Initialize new matrix for transformed genotypes
G <- matrix(NA, nrow = nrow(geno), ncol = ncol(geno))
# Recode genotypes
G[geno %in% c("0/0", "0|0")] <- 0
G[geno %in% c("0/1", "1/0", "1|0", "0|1")] <- 1
G[geno %in% c("1/1", "1|1")] <- 2
# Replace NA with 9 for missing data
G <- replace(G, is.na(G), 9)
# Check the counts of each genotype
table(as.vector(G))##
## 0 1 2 9
## 82640 54795 41607 3758
# Extract SNP names
snpNames <- vcfR::getID(vcf)
# Calculate FST with SNP names
my_fst <- MakeDiploidFSTMat(t(G), locusNames = snpNames, popNames = popNames)## Calculating FSTs, may take a few minutes...
## LocusName He FST T1 T2 FSTNoCorr
## 1 AX-583034838 0.2822000 0.09331193 0.013958771 0.1495926 0.1124438309
## 2 AX-583038083 NA NA NA NA NA
## 3 AX-583043989 0.4398167 -0.01852295 -0.004073618 0.2199228 0.0005218262
## 4 AX-583044062 0.5000000 0.25780281 0.075192622 0.2916672 0.2822587929
## 5 AX-583045992 0.4872000 -0.01515694 -0.003705634 0.2444843 0.0077800316
## 6 AX-583046358 0.4991319 -0.01921037 -0.004803532 0.2500490 0.0045325779
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 0.0168254027 0.1496338 0.1700000
## 2 NA NA NA
## 3 0.0001147842 0.2199664 0.6734694
## 4 0.0823547394 0.2917703 0.5000000
## 5 0.0019027239 0.2445651 0.4200000
## 6 0.0011337868 0.2501417 0.4791667
Check the FST distribution
# Create the histogram of FST values
ggplot(my_fst, aes(x=FST)) +
geom_histogram(binwidth=0.001, fill="blue", alpha=0.7) +
labs(title="Histogram of FST Values",
x="FST",
y="Frequency") +
theme_minimal()## Warning: Removed 44 rows containing non-finite values (`stat_bin()`).
We can select the SNPs with low FST to use as neutral SNPs
my_fst |>
filter(FST < 0.2 & FST >=-0.1) |>
dplyr::select(LocusName) -> neutral_snps
length(neutral_snps$LocusName)## [1] 3024
write.table(
neutral_snps$LocusName,
file = here("output", "outflank","new_aut2_SNPs.txt"),
row.names = FALSE,
quote = FALSE,
col.names = FALSE,
sep = "\n"
)Create a vcf
plink2 \
--bfile output/outflank/new_aut2 \
--export vcf \
--extract output/outflank/new_aut2_SNPs.txt \
--out output/outflank/new_aut3 \
--silent;
grep "samples\|variants" output/outflank/new_aut3.log ## 50 samples (24 females, 26 males; 50 founders) loaded from
## 3656 variants loaded from output/outflank/new_aut2.bim.
## --extract: 3024 variants remaining.
## 3024 variants remaining after main filters.
Import the data.
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 3024
## column count: 59
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 3024
## Character matrix gt cols: 59
## skip: 0
## nrows: 3024
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant: 3024
## All variants processed
# vcf <- read.vcfR(here("output","outflank","intergenic.vcf"))
# Extract genotypes
geno <- vcfR::extract.gt(vcf)
# Extract locus names (positions)
locusNames <- vcfR::getPOS(vcf)
# Samples
sampleNames <- colnames(geno)
# Extract population names by splitting the sample names
# If your sample names contain population information.
popNames <- sapply(strsplit(sampleNames, "_"), "[", 1)
# Initialize new matrix for transformed genotypes
G <- matrix(NA, nrow = nrow(geno), ncol = ncol(geno))
# Recode genotypes
G[geno %in% c("0/0", "0|0")] <- 0
G[geno %in% c("0/1", "1/0", "1|0", "0|1")] <- 1
G[geno %in% c("1/1", "1|1")] <- 2
# Replace NA with 9 for missing data
G <- replace(G, is.na(G), 9)
# Check the counts of each genotype
table(as.vector(G))##
## 0 1 2 9
## 68485 46869 32840 3006
# Extract SNP names
snpNames <- vcfR::getID(vcf)
# Calculate FST with SNP names
my_fst <- MakeDiploidFSTMat(t(G), locusNames = snpNames, popNames = popNames)## Calculating FSTs, may take a few minutes...
## LocusName He FST T1 T2 FSTNoCorr
## 1 AX-583034838 0.28220000 0.093311934 1.395877e-02 0.14959256 0.1124438309
## 2 AX-583043989 0.43981674 -0.018522950 -4.073618e-03 0.21992275 0.0005218262
## 3 AX-583045992 0.48720000 -0.015156940 -3.705634e-03 0.24448430 0.0077800316
## 4 AX-583046358 0.49913194 -0.019210365 -4.803532e-03 0.25004895 0.0045325779
## 5 AX-583051309 0.02150284 -0.005233131 -5.673413e-05 0.01084134 0.0170496664
## 6 AX-583056820 0.44220000 -0.007525669 -1.683139e-03 0.22365304 0.0233733855
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 0.0168254027 0.14963384 0.1700000
## 2 0.0001147842 0.21996636 0.6734694
## 3 0.0019027239 0.24456506 0.4200000
## 4 0.0011337868 0.25014172 0.4791667
## 5 0.0001849112 0.01084545 0.9891304
## 6 0.0052298554 0.22375258 0.6700000
Check the FST distribution
# Create the histogram of FST values
ggplot(my_fst, aes(x=FST)) +
geom_histogram(binwidth=0.001, fill="blue", alpha=0.7) +
labs(title="Histogram of FST Values",
x="FST",
y="Frequency") +
theme_minimal()Data prep: decide which SNPs to use for calibrating the null distribution of Fst
# Run OutFLANK to estimate the neutral FST distribution
out_trim <- OutFLANK(FstDataFrame = my_fst, Hmin = 0.1, NumberOfSamples = 60, qthreshold = 0.05)
str(out_trim)## List of 6
## $ FSTbar : num 0.0414
## $ FSTNoCorrbar : num 0.0635
## $ dfInferred : num 2
## $ numberLowFstOutliers : int 0
## $ numberHighFstOutliers: int 0
## $ results :'data.frame': 3024 obs. of 15 variables:
## ..$ LocusName : chr [1:3024] "AX-583034838" "AX-583043989" "AX-583045992" "AX-583046358" ...
## ..$ He : num [1:3024] 0.2822 0.4398 0.4872 0.4991 0.0215 ...
## ..$ FST : num [1:3024] 0.09331 -0.01852 -0.01516 -0.01921 -0.00523 ...
## ..$ T1 : num [1:3024] 1.40e-02 -4.07e-03 -3.71e-03 -4.80e-03 -5.67e-05 ...
## ..$ T2 : num [1:3024] 0.1496 0.2199 0.2445 0.25 0.0108 ...
## ..$ FSTNoCorr : num [1:3024] 0.112444 0.000522 0.00778 0.004533 0.01705 ...
## ..$ T1NoCorr : num [1:3024] 0.016825 0.000115 0.001903 0.001134 0.000185 ...
## ..$ T2NoCorr : num [1:3024] 0.1496 0.22 0.2446 0.2501 0.0108 ...
## ..$ meanAlleleFreq : num [1:3024] 0.17 0.673 0.42 0.479 0.989 ...
## ..$ indexOrder : int [1:3024] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ GoodH : chr [1:3024] "goodH" "goodH" "goodH" "goodH" ...
## ..$ qvalues : num [1:3024] 0.693 0.928 0.928 0.928 NA ...
## ..$ pvalues : num [1:3024] 0.3409 0.0164 0.2305 0.1377 NA ...
## ..$ pvaluesRightTail: num [1:3024] 0.17 0.992 0.885 0.931 NA ...
## ..$ OutlierFlag : logi [1:3024] FALSE FALSE FALSE FALSE FALSE FALSE ...
Check the fit and make sure it looks good, especially in the right tail:
OutFLANKResultsPlotter(
out_trim,
withOutliers = TRUE,
NoCorr = TRUE,
Hmin = 0.1,
binwidth = 0.001,
Zoom = FALSE,
RightZoomFraction = 0.05,
titletext = NULL
)## Zoom in on right tail
OutFLANKResultsPlotter(
out_trim ,
withOutliers = TRUE,
NoCorr = TRUE,
Hmin = 0.1,
binwidth = 0.001,
Zoom =
TRUE,
RightZoomFraction = 0.05,
titletext = NULL
)Also check the P-value histogram: Here, we plot the “right-tailed” P-values, which means that outliers in the right tail of the FST distribution will have a P-value near zero. Because we ran the algorithm on a trimmed set of SNPs, this will remove some of the signal around selected sites. So we expect this histogram to be flat and maybe have a bump near 0 for selected sites. This histogram looks pretty good.
Now that we’ve estimated neutral mean FST and df to a quasi-independent set of SNPs, we can go and calculate P-values and q-values for all the loci in our dataset before prunning.
Note that it is important to run this code with the uncorrected FSTs (FSTNoCorr) and the uncorrected mean FST (FSTNoCorrbar).
Now we can use the entire data set to estimate the fst
Create a vcf - ld pruned data
plink2 \
--bfile output/outflank/new_aut \
--export vcf \
--make-bed \
--out output/outflank/new_aut4 \
--silent;
grep "samples\|variants" output/outflank/new_aut4.log ## 50 samples (24 females, 26 males; 50 founders) loaded from
## 41468 variants loaded from output/outflank/new_aut.bim.
Import the data
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 41468
## column count: 59
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 41468
## Character matrix gt cols: 59
## skip: 0
## nrows: 41468
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant 38000
Processed variant 39000
Processed variant 40000
Processed variant 41000
Processed variant: 41468
## All variants processed
# Extract genotypes
geno2 <- vcfR::extract.gt(vcf2)
# Extract locus names (positions)
locusNames2 <- vcfR::getPOS(vcf2)
# Samples
sampleNames2 <- colnames(geno2)
# Extract population names by splitting the sample names
# If your sample names contain population information.
popNames2 <- sapply(strsplit(sampleNames2, "_"), "[", 1)
# Initialize new matrix for transformed genotypes
H <- matrix(NA, nrow = nrow(geno2), ncol = ncol(geno2))
# Recode genotypes
H[geno2 %in% c("0/0", "0|0")] <- 0
H[geno2 %in% c("0/1", "1/0", "1|0", "0|1")] <- 1
H[geno2 %in% c("1/1", "1|1")] <- 2
# Replace NA with 9 for missing data
H <- replace(H, is.na(H), 9)
# Check the counts of each genotype
table(as.vector(H))##
## 0 1 2 9
## 992388 542424 485838 52750
# Extract SNP names
snpNames <- vcfR::getID(vcf2)
# Use the option below for names if you want to make the plot without having to add the postion later on
locusNames2 <- vcfR::getPOS(vcf2)
# Calculate FST
my_fst2 <- MakeDiploidFSTMat(t(H), locusNames = locusNames2, popNames = popNames2)## Calculating FSTs, may take a few minutes...
## [1] "10000 done of 41468"
## [1] "20000 done of 41468"
## [1] "30000 done of 41468"
## [1] "40000 done of 41468"
my_fst3 <- MakeDiploidFSTMat(t(H), locusNames = snpNames, popNames = popNames2) # with SNP id instead of position## Calculating FSTs, may take a few minutes...
## [1] "10000 done of 41468"
## [1] "20000 done of 41468"
## [1] "30000 done of 41468"
## [1] "40000 done of 41468"
# Concatenate the SNP names and the SNP positions
locusNames3 <- paste(snpNames, locusNames2, sep = ".")
# Calculate FST with the combined names
my_fst4 <- MakeDiploidFSTMat(t(H), locusNames = locusNames3, popNames = popNames2)## Calculating FSTs, may take a few minutes...
## [1] "10000 done of 41468"
## [1] "20000 done of 41468"
## [1] "30000 done of 41468"
## [1] "40000 done of 41468"
## LocusName He FST T1 T2 FSTNoCorr
## 1 97856 0.19929196 0.26029330 0.0300433412 0.11542111 0.2707182320
## 2 305518 0.06309074 -0.02204193 -0.0006949276 0.03152753 0.0009978524
## 3 308124 0.39579395 0.04920886 0.0101009868 0.20526764 0.0713278675
## 4 315059 0.24080000 0.01464781 0.0017917097 0.12231931 0.0314048163
## 5 315386 0.48000000 0.05111637 0.0127162581 0.24877078 0.0700303844
## 6 330908 0.49524093 -0.02744264 -0.0068182185 0.24845342 0.0076309429
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 3.125000e-02 0.11543367 0.8877551
## 2 3.149408e-05 0.03156186 0.9673913
## 3 1.464682e-02 0.20534498 0.7282609
## 4 3.842343e-03 0.12234884 0.8600000
## 5 1.742626e-02 0.24883860 0.6000000
## 6 1.896923e-03 0.24858303 0.4512195
## LocusName He FST T1 T2 FSTNoCorr
## 1 AX-581444870 0.19929196 0.26029330 0.0300433412 0.11542111 0.2707182320
## 2 AX-583035083 0.06309074 -0.02204193 -0.0006949276 0.03152753 0.0009978524
## 3 AX-583035102 0.39579395 0.04920886 0.0101009868 0.20526764 0.0713278675
## 4 AX-583033342 0.24080000 0.01464781 0.0017917097 0.12231931 0.0314048163
## 5 AX-583035163 0.48000000 0.05111637 0.0127162581 0.24877078 0.0700303844
## 6 AX-583035198 0.49524093 -0.02744264 -0.0068182185 0.24845342 0.0076309429
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 3.125000e-02 0.11543367 0.8877551
## 2 3.149408e-05 0.03156186 0.9673913
## 3 1.464682e-02 0.20534498 0.7282609
## 4 3.842343e-03 0.12234884 0.8600000
## 5 1.742626e-02 0.24883860 0.6000000
## 6 1.896923e-03 0.24858303 0.4512195
## LocusName He FST T1 T2
## 1 AX-581444870.97856 0.19929196 0.26029330 0.0300433412 0.11542111
## 2 AX-583035083.305518 0.06309074 -0.02204193 -0.0006949276 0.03152753
## 3 AX-583035102.308124 0.39579395 0.04920886 0.0101009868 0.20526764
## 4 AX-583033342.315059 0.24080000 0.01464781 0.0017917097 0.12231931
## 5 AX-583035163.315386 0.48000000 0.05111637 0.0127162581 0.24877078
## 6 AX-583035198.330908 0.49524093 -0.02744264 -0.0068182185 0.24845342
## FSTNoCorr T1NoCorr T2NoCorr meanAlleleFreq
## 1 0.2707182320 3.125000e-02 0.11543367 0.8877551
## 2 0.0009978524 3.149408e-05 0.03156186 0.9673913
## 3 0.0713278675 1.464682e-02 0.20534498 0.7282609
## 4 0.0314048163 3.842343e-03 0.12234884 0.8600000
## 5 0.0700303844 1.742626e-02 0.24883860 0.6000000
## 6 0.0076309429 1.896923e-03 0.24858303 0.4512195
Data checks: Heterozygosity vs. FST Here, you can see how some of the low H loci have high FST. These are all neutral loci in the simulation, and it is important to exclude them from the OutFLANK algorithm.
Note that it is important to run this code with the uncorrected FSTs (FSTNoCorr) and the uncorrected mean FST (FSTNoCorrbar).
P1 <-
pOutlierFinderChiSqNoCorr(
my_fst4,
Fstbar = out_trim$FSTNoCorrbar,
dfInferred = out_trim$dfInferred,
qthreshold = 0.05,
Hmin = 0.1
)
head(P1)## LocusName He FST T1 T2 FSTNoCorr
## 1 AX-581444870.97856 0.1992920 0.26029330 0.030043341 0.1154211 0.270718232
## 3 AX-583035102.308124 0.3957940 0.04920886 0.010100987 0.2052676 0.071327867
## 4 AX-583033342.315059 0.2408000 0.01464781 0.001791710 0.1223193 0.031404816
## 5 AX-583035163.315386 0.4800000 0.05111637 0.012716258 0.2487708 0.070030384
## 6 AX-583035198.330908 0.4952409 -0.02744264 -0.006818219 0.2484534 0.007630943
## 7 AX-583035257.442875 0.4550000 0.27186428 0.072300598 0.2659437 0.285336102
## T1NoCorr T2NoCorr meanAlleleFreq pvalues pvaluesRightTail qvalues
## 1 0.031250000 0.1154337 0.8877551 0.02824556 0.01412278 0.06691806
## 3 0.014646820 0.2053450 0.7282609 0.65099497 0.32549748 0.44537485
## 4 0.003842343 0.1223488 0.8600000 0.77985769 0.61007115 0.64023181
## 5 0.017426263 0.2488386 0.6000000 0.66442289 0.33221145 0.45178842
## 6 0.001896923 0.2485830 0.4512195 0.22629914 0.88685043 0.70960166
## 7 0.075898128 0.2659955 0.3500000 0.02244152 0.01122076 0.05848341
## OutlierFlag
## 1 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## 7 FALSE
## LocusName He FST T1 T2
## 41437 AX-583031428.488069712 0.09500000 0.10278737 0.005217173 0.05075695
## 41439 AX-583031456.488074006 0.09683465 0.10299235 0.005355334 0.05199739
## 41441 AX-583031495.488076823 0.00000000 0.00000000 0.000000000 0.00000000
## 41459 AX-583032613.488633131 0.07830071 0.07074236 0.002930960 0.04143147
## 41463 AX-583032854.488822384 0.05935027 0.04271123 0.001317169 0.03083894
## 41468 AX-583034980.489339661 0.07830071 0.08142327 0.003374222 0.04144051
## FSTNoCorr T1NoCorr T2NoCorr meanAlleleFreq pvalues pvaluesRightTail
## 41437 0.12716175 0.006456612 0.05077479 0.9500000 NA NA
## 41439 0.13618677 0.007086168 0.05203272 0.9489796 NA NA
## 41441 0.00000000 0.000000000 0.00000000 0.0000000 NA NA
## 41459 0.10937500 0.004535147 0.04146420 0.9591837 NA NA
## 41463 0.07534598 0.002324380 0.03084943 0.9693878 NA NA
## 41468 0.10937500 0.004535147 0.04146420 0.9591837 NA NA
## qvalues OutlierFlag
## 41437 NA NA
## 41439 NA NA
## 41441 NA NA
## 41459 NA NA
## 41463 NA NA
## 41468 NA NA
# notice how the output is ordered differently
my_out <- P1$OutlierFlag==TRUE
plot(P1$He, P1$FST, pch=19, col=rgb(0,0,0,0.1))
points(P1$He[my_out], P1$FST[my_out], col="blue")# check the P-value histogram for the full set of data
# if there are outliers, it should look flat with an inflation near 0Highlight outliers on Manhattan Plot
For publication, we want to show the accurate estimate of FST, not the uncorrected estimate. Remember to exclude those low H loci!
# Use the separate() function to split the LocusName column
P1 <- P1 |>
separate(LocusName, into = c("SNP_id", "LocusName"), sep = "\\.")
# Create plot
plot(P1$LocusName[P1$He>0.1], P1$FST[P1$He>0.1],
xlab="Position", ylab="FST", col=rgb(0,0,0,0.2))
points(P1$LocusName[my_out], P1$FST[my_out], col="magenta", pch=20)
output/local_adaptation/file3 We can use our bim file to create a plot
with the chromosomal ids
# ____________________________________________________________________________
# import the bim file with the SNP data ####
snps <-
read_delim( # to learn about the options use here, run ?read_delim on the console.
here(
"output", "outflank", "man_aut.bim"
), # use library here to load it
col_names = FALSE, # we don't have header in the input file
show_col_types = FALSE, # suppress message from read_delim
col_types = "ccidcc" # set the class of each column
)
#
# set column names
colnames(
snps
) <- # to add a header in our tibble
c(
"Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
)
#
# check the tibble
head(snps)## # A tibble: 6 × 6
## Scaffold SNP Cm Position Allele1 Allele2
## <chr> <chr> <int> <dbl> <chr> <chr>
## 1 1 AX-581444870 0 97856 C T
## 2 1 AX-583035083 0 305518 A G
## 3 1 AX-583035102 0 308124 A G
## 4 1 AX-583033342 0 315059 C G
## 5 1 AX-583035163 0 315386 A G
## 6 1 AX-583035198 0 330908 G T
Now we can merge it with my_fst3
# Use the separate() function to split the LocusName column
my_fst4 <- my_fst4 |>
separate(LocusName, into = c("SNP_id", "LocusName"), sep = "\\.")
# Merge the data frames
merged_df <- merge(my_fst4, snps, by.x = "SNP_id", by.y = "SNP")
# Rename the column
merged_df <- rename(merged_df, Chromosome = Scaffold)
# View the first few rows of the merged data frame
head(merged_df)## SNP_id LocusName He FST T1 T2
## 1 AX-579436298 359893669 0.49920000 0.00256442 0.0006487357 0.25297558
## 2 AX-579436317 359894534 0.16680550 0.13906779 0.0125725338 0.09040579
## 3 AX-579436673 359886150 0.48313203 0.05747358 0.0144872321 0.25206766
## 4 AX-579436698 359886370 0.21491045 0.05152078 0.0057881451 0.11234583
## 5 AX-579436724 359914399 0.48380000 0.13653755 0.0357830794 0.26207502
## 6 AX-579436772 359886810 0.07986111 0.08060647 0.0033859421 0.04200584
## FSTNoCorr T1NoCorr T2NoCorr meanAlleleFreq Chromosome Cm Position
## 1 0.02699244 0.006830832 0.25306460 0.48000000 2 0 359893669
## 2 0.15360502 0.013888889 0.09041950 0.09183673 2 0 359894534
## 3 0.08296925 0.020919421 0.25213463 0.40816327 2 0 359886150
## 4 0.08686331 0.009762326 0.11238721 0.87755102 2 0 359886370
## 5 0.15218904 0.039893953 0.26213421 0.59000000 2 0 359914399
## 6 0.09836066 0.004132231 0.04201102 0.95833333 2 0 359886810
## Allele1 Allele2
## 1 C T
## 2 G T
## 3 T C
## 4 A T
## 5 G T
## 6 T C
Use ggplot to make a new plot
# Subset dataframe for He > 0.1
df_sub <- subset(merged_df, He > 0.1)
# Adjust my_out condition to fit merged_df
my_out <- P1$SNP_id[P1$OutlierFlag == TRUE]
# Save it
write.table(
my_out,
file = here("output", "outflank","new_aut_SNPs_outFlank.txt"),
row.names = FALSE,
quote = FALSE,
col.names = FALSE,
sep = "\n"
)
# Create a new column in the dataframe to define highlight points based on my_out
df_sub$highlight <- ifelse(df_sub$SNP_id %in% my_out, TRUE, FALSE)
# source the plotting function
source(
here(
"scripts", "analysis", "my_theme2.R"
)
)
# Custom label function
k_format <- function(x) {
paste0(format(x / 1e6, big.mark = "", scientific = FALSE), "k")
}
# Set colors for the chromosomes
color_vector <- c("#CCF6D6", "#F6E1CC", "#CCD8F6") # Add or remove colors as needed.
names(color_vector) <- unique(df_sub$Chromosome) # Make sure unique(df_sub$Chromosome) gives all unique Chromosome values.
# Filter data to include only the highest FST value for each chromosome
highest_FST <- df_sub[which(df_sub$highlight == TRUE), ]
highest_FST <- highest_FST[which(highest_FST$FST == ave(highest_FST$FST, highest_FST$Chromosome, FUN = max)), ]
# Create the plot
ggplot(df_sub, aes(x = Position, y = FST)) +
geom_point(aes(color = Chromosome),
data = subset(df_sub, highlight == FALSE),
size = .3) +
geom_point(
color = "magenta",
data = subset(df_sub, highlight == TRUE),
size = .3
) +
scale_color_manual(values = color_vector, guide = "none") +
labs(x = "Position", y = "FST", caption = "") +
facet_wrap(~ Chromosome, scales = "free_x") +
scale_x_continuous(labels = k_format) +
my_theme() +
theme(panel.spacing = unit(.2, "lines"),
plot.margin = unit(c(1, 1, 2, 2), "lines"),
plot.caption = element_markdown(face = "italic", color = "#574E4E")) +
# Annotate only the highest highlighted point for each chromosome
geom_text_repel(
data = highest_FST,
aes(label = SNP_id),
size = 3,
nudge_y = 0.01,
segment.color = NA,
max.overlaps = Inf # Set to 'Inf' to show all annotations
)This tool does not directly account for population structure. It instead looks for loci that show a higher Fst than the neutral expectation, under a model of hierarchical population structure. The method is robust to hierarchical population structure and a range of demographic histories.
Check the documentation https://htmlpreview.github.io/?https://github.com/whitlock/OutFLANK/blob/master/inst/doc/OutFLANKAnalysis.html
“OutFLANK is an R package that implements the method developed by Whitlock and Lotterhos (2015) to use likelihood on a trimmed distribution of FST values to infer the distribution of FST for neutral markers. This distribution is then used to assign q-values to each locus to detect outliers that may be due to spatially heterogeneous selection.”
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 3007106 160.6 4700540 251.1 NA 4700540 251.1
## Vcells 6375213 48.7 31142019 237.6 32768 41160966 314.1
Create vcf with intergenic SNPs that are not linked
plink2 \
--bfile output/outflank/man_new \
--extract output/outflank/indepSNP.prune.in \
--export vcf \
--make-bed \
--out output/outflank/man_new2 \
--silent;
grep 'samples\|variants\|remaining' output/outflank/man_new2.log## 32 samples (11 females, 11 males, 10 ambiguous; 32 founders) loaded from
## 40985 variants loaded from output/outflank/man_new.bim.
## --extract: 3652 variants remaining.
## 3652 variants remaining after main filters.
We need to import the data
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 3652
## column count: 41
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 3652
## Character matrix gt cols: 41
## skip: 0
## nrows: 3652
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant: 3652
## All variants processed
# Extract genotypes
geno <- vcfR::extract.gt(vcf)
# Extract locus names (positions)
locusNames <- vcfR::getPOS(vcf)
# Samples
sampleNames <- colnames(geno)
# Extract population names by splitting the sample names
# If your sample names contain population information.
popNames <- sapply(strsplit(sampleNames, "_"), "[", 1)
# Initialize new matrix for transformed genotypes
G <- matrix(NA, nrow = nrow(geno), ncol = ncol(geno))
# Recode genotypes
G[geno %in% c("0/0", "0|0")] <- 0
G[geno %in% c("0/1", "1/0", "1|0", "0|1")] <- 1
G[geno %in% c("1/1", "1|1")] <- 2
# Replace NA with 9 for missing data
G <- replace(G, is.na(G), 9)
# Check the counts of each genotype
table(as.vector(G))##
## 0 1 2 9
## 53673 34876 25686 2629
# Extract SNP names
snpNames <- vcfR::getID(vcf)
# Calculate FST with SNP names
my_fst <- MakeDiploidFSTMat(t(G), locusNames = snpNames, popNames = popNames)## Calculating FSTs, may take a few minutes...
## LocusName He FST T1 T2 FSTNoCorr
## 1 AX-583034838 0.35888672 0.002890432 0.0005282369 0.18275362 0.040987581
## 2 AX-583038083 0.06054688 0.122172465 0.0040303030 0.03298864 0.150943396
## 3 AX-583043989 0.40429688 -0.005819247 -0.0011928375 0.20498140 0.033878768
## 4 AX-583044062 0.46044922 0.089906003 0.0224676309 0.24990134 0.152450797
## 5 AX-583045992 0.49804688 -0.039363467 -0.0097658402 0.24809401 0.004138645
## 6 AX-583046358 0.49777778 0.088182948 0.0236295576 0.26796062 0.134838800
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 0.007530992 0.1837384 0.2343750
## 2 0.005000000 0.0331250 0.9687500
## 3 0.006983471 0.2061312 0.7187500
## 4 0.038440083 0.2521475 0.3593750
## 5 0.001033058 0.2496126 0.4687500
## 6 0.036407155 0.2700050 0.5333333
Check the FST distribution
# Create the histogram of FST values
ggplot(my_fst, aes(x=FST)) +
geom_histogram(binwidth=0.001, fill="blue", alpha=0.7) +
labs(title="Histogram of FST Values",
x="FST",
y="Frequency") +
theme_minimal()## Warning: Removed 11 rows containing non-finite values (`stat_bin()`).
We can select the SNPs with low FST to use as neutral SNPs
my_fst |>
filter(FST < 0.2 & FST >=-0.1) |>
dplyr::select(LocusName) -> neutral_snps
length(neutral_snps$LocusName)## [1] 3390
write.table(
neutral_snps$LocusName,
file = here("output", "outflank","man_new2_SNPs.txt"),
row.names = FALSE,
quote = FALSE,
col.names = FALSE,
sep = "\n"
)Create a vcf
plink2 \
--bfile output/outflank/man_new2 \
--export vcf \
--extract output/outflank/new_aut2_SNPs.txt \
--out output/outflank/man_new3 \
--silent;
grep "samples\|variants" output/outflank/man_new3.log ## 32 samples (11 females, 11 males, 10 ambiguous; 32 founders) loaded from
## 3652 variants loaded from output/outflank/man_new2.bim.
## --extract: 3011 variants remaining.
## 3011 variants remaining after main filters.
Import the data.
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 3011
## column count: 41
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 3011
## Character matrix gt cols: 41
## skip: 0
## nrows: 3011
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant: 3011
## All variants processed
# vcf <- read.vcfR(here("output","outflank","intergenic.vcf"))
# Extract genotypes
geno <- vcfR::extract.gt(vcf)
# Extract locus names (positions)
locusNames <- vcfR::getPOS(vcf)
# Samples
sampleNames <- colnames(geno)
# Extract population names by splitting the sample names
# If your sample names contain population information.
popNames <- sapply(strsplit(sampleNames, "_"), "[", 1)
# Initialize new matrix for transformed genotypes
G <- matrix(NA, nrow = nrow(geno), ncol = ncol(geno))
# Recode genotypes
G[geno %in% c("0/0", "0|0")] <- 0
G[geno %in% c("0/1", "1/0", "1|0", "0|1")] <- 1
G[geno %in% c("1/1", "1|1")] <- 2
# Replace NA with 9 for missing data
G <- replace(G, is.na(G), 9)
# Check the counts of each genotype
table(as.vector(G))##
## 0 1 2 9
## 44312 28896 21027 2117
# Extract SNP names
snpNames <- vcfR::getID(vcf)
# Calculate FST with SNP names
my_fst <- MakeDiploidFSTMat(t(G), locusNames = snpNames, popNames = popNames)## Calculating FSTs, may take a few minutes...
## LocusName He FST T1 T2 FSTNoCorr
## 1 AX-583034838 0.3588867 0.002890432 0.0005282369 0.18275362 0.040987581
## 2 AX-583043989 0.4042969 -0.005819247 -0.0011928375 0.20498140 0.033878768
## 3 AX-583045992 0.4980469 -0.039363467 -0.0097658402 0.24809401 0.004138645
## 4 AX-583046358 0.4977778 0.088182948 0.0236295576 0.26796062 0.134838800
## 5 AX-583051309 0.0980975 0.192799799 0.0109053498 0.05656308 0.243697479
## 6 AX-583056820 0.3901367 -0.054848101 -0.0106232782 0.19368543 0.001322970
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 0.0075309917 0.18373838 0.2343750
## 2 0.0069834711 0.20613120 0.7187500
## 3 0.0010330579 0.24961260 0.4687500
## 4 0.0364071555 0.27000504 0.5333333
## 5 0.0138888889 0.05699234 0.9482759
## 6 0.0002582645 0.19521565 0.7343750
Check the FST distribution
# Create the histogram of FST values
ggplot(my_fst, aes(x=FST)) +
geom_histogram(binwidth=0.001, fill="blue", alpha=0.7) +
labs(title="Histogram of FST Values",
x="FST",
y="Frequency") +
theme_minimal()## Warning: Removed 10 rows containing non-finite values (`stat_bin()`).
Data prep: decide which SNPs to use for calibrating the null distribution of Fst
# Run OutFLANK to estimate the neutral FST distribution
out_trim <- OutFLANK(FstDataFrame = my_fst, Hmin = 0.1, NumberOfSamples = 60, qthreshold = 0.05)
str(out_trim)## List of 6
## $ FSTbar : num 0.0375
## $ FSTNoCorrbar : num 0.0797
## $ dfInferred : num 2
## $ numberLowFstOutliers : int 0
## $ numberHighFstOutliers: int 0
## $ results :'data.frame': 3011 obs. of 15 variables:
## ..$ LocusName : chr [1:3011] "AX-583034838" "AX-583043989" "AX-583045992" "AX-583046358" ...
## ..$ He : num [1:3011] 0.3589 0.4043 0.498 0.4978 0.0981 ...
## ..$ FST : num [1:3011] 0.00289 -0.00582 -0.03936 0.08818 0.1928 ...
## ..$ T1 : num [1:3011] 0.000528 -0.001193 -0.009766 0.02363 0.010905 ...
## ..$ T2 : num [1:3011] 0.1828 0.205 0.2481 0.268 0.0566 ...
## ..$ FSTNoCorr : num [1:3011] 0.04099 0.03388 0.00414 0.13484 0.2437 ...
## ..$ T1NoCorr : num [1:3011] 0.00753 0.00698 0.00103 0.03641 0.01389 ...
## ..$ T2NoCorr : num [1:3011] 0.184 0.206 0.25 0.27 0.057 ...
## ..$ meanAlleleFreq : num [1:3011] 0.234 0.719 0.469 0.533 0.948 ...
## ..$ indexOrder : int [1:3011] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ GoodH : chr [1:3011] "goodH" "goodH" "goodH" "goodH" ...
## ..$ qvalues : num [1:3011] 0.987 0.987 0.987 0.957 NA ...
## ..$ pvalues : num [1:3011] 0.804 0.692 0.101 0.368 NA ...
## ..$ pvaluesRightTail: num [1:3011] 0.598 0.654 0.949 0.184 NA ...
## ..$ OutlierFlag : logi [1:3011] FALSE FALSE FALSE FALSE FALSE FALSE ...
Check the fit and make sure it looks good, especially in the right tail:
OutFLANKResultsPlotter(
out_trim,
withOutliers = TRUE,
NoCorr = TRUE,
Hmin = 0.1,
binwidth = 0.001,
Zoom = FALSE,
RightZoomFraction = 0.05,
titletext = NULL
)## Zoom in on right tail
OutFLANKResultsPlotter(
out_trim ,
withOutliers = TRUE,
NoCorr = TRUE,
Hmin = 0.1,
binwidth = 0.001,
Zoom =
TRUE,
RightZoomFraction = 0.05,
titletext = NULL
)Also check the P-value histogram: Here, we plot the “right-tailed” P-values, which means that outliers in the right tail of the FST distribution will have a P-value near zero. Because we ran the algorithm on a trimmed set of SNPs, this will remove some of the signal around selected sites. So we expect this histogram to be flat and maybe have a bump near 0 for selected sites. This histogram looks pretty good.
Now that we’ve estimated neutral mean FST and df to a quasi-independent set of SNPs, we can go and calculate P-values and q-values for all the loci in our dataset before prunning.
Note that it is important to run this code with the uncorrected FSTs (FSTNoCorr) and the uncorrected mean FST (FSTNoCorrbar).
Now we can use the entire data set to estimate the fst
Create a vcf - ld pruned data
plink2 \
--bfile output/outflank/man_new \
--export vcf \
--make-bed \
--out output/outflank/man_new4 \
--silent;
grep "samples\|variants" output/outflank/man_new4.log ## 32 samples (11 females, 11 males, 10 ambiguous; 32 founders) loaded from
## 40985 variants loaded from output/outflank/man_new.bim.
Import the data
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 40985
## column count: 41
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 40985
## Character matrix gt cols: 41
## skip: 0
## nrows: 40985
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant 38000
Processed variant 39000
Processed variant 40000
Processed variant: 40985
## All variants processed
# Extract genotypes
geno2 <- vcfR::extract.gt(vcf2)
# Extract locus names (positions)
locusNames2 <- vcfR::getPOS(vcf2)
# Samples
sampleNames2 <- colnames(geno2)
# Extract population names by splitting the sample names
# If your sample names contain population information.
popNames2 <- sapply(strsplit(sampleNames2, "_"), "[", 1)
# Initialize new matrix for transformed genotypes
H <- matrix(NA, nrow = nrow(geno2), ncol = ncol(geno2))
# Recode genotypes
H[geno2 %in% c("0/0", "0|0")] <- 0
H[geno2 %in% c("0/1", "1/0", "1|0", "0|1")] <- 1
H[geno2 %in% c("1/1", "1|1")] <- 2
# Replace NA with 9 for missing data
H <- replace(H, is.na(H), 9)
# Check the counts of each genotype
table(as.vector(H))##
## 0 1 2 9
## 628890 350084 296716 35830
# Extract SNP names
snpNames <- vcfR::getID(vcf2)
# Use the option below for names if you want to make the plot without having to add the postion later on
locusNames2 <- vcfR::getPOS(vcf2)
# Calculate FST
my_fst2 <- MakeDiploidFSTMat(t(H), locusNames = locusNames2, popNames = popNames2)## Calculating FSTs, may take a few minutes...
## [1] "10000 done of 40985"
## [1] "20000 done of 40985"
## [1] "30000 done of 40985"
## [1] "40000 done of 40985"
my_fst3 <- MakeDiploidFSTMat(t(H), locusNames = snpNames, popNames = popNames2) # with SNP id instead of position## Calculating FSTs, may take a few minutes...
## [1] "10000 done of 40985"
## [1] "20000 done of 40985"
## [1] "30000 done of 40985"
## [1] "40000 done of 40985"
# Concatenate the SNP names and the SNP positions
locusNames3 <- paste(snpNames, locusNames2, sep = ".")
# Calculate FST with the combined names
my_fst4 <- MakeDiploidFSTMat(t(H), locusNames = locusNames3, popNames = popNames2)## Calculating FSTs, may take a few minutes...
## [1] "10000 done of 40985"
## [1] "20000 done of 40985"
## [1] "30000 done of 40985"
## [1] "40000 done of 40985"
## LocusName He FST T1 T2 FSTNoCorr
## 1 97856 0.34963580 -0.009030940 -0.0015891675 0.17596923 1.963268e-02
## 2 305518 0.03635117 -0.021922428 -0.0004012346 0.01830247 2.097902e-02
## 3 308124 0.33944444 0.002770083 0.0004799107 0.17324777 4.488330e-02
## 4 315059 0.19482422 -0.015419420 -0.0015103306 0.09794990 1.774115e-02
## 5 315386 0.41748047 -0.034135672 -0.0070915978 0.20774742 4.948872e-05
## 6 330908 0.47530864 -0.050214091 -0.0119444444 0.23787037 1.449275e-02
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 3.472222e-03 0.17685932 0.7741935
## 2 3.858025e-04 0.01838992 0.9814815
## 3 7.812500e-03 0.17406250 0.7833333
## 4 1.745868e-03 0.09840780 0.8906250
## 5 1.033058e-05 0.20874613 0.7031250
## 6 3.472222e-03 0.23958333 0.3888889
## LocusName He FST T1 T2 FSTNoCorr
## 1 AX-581444870 0.34963580 -0.009030940 -0.0015891675 0.17596923 1.963268e-02
## 2 AX-583035083 0.03635117 -0.021922428 -0.0004012346 0.01830247 2.097902e-02
## 3 AX-583035102 0.33944444 0.002770083 0.0004799107 0.17324777 4.488330e-02
## 4 AX-583033342 0.19482422 -0.015419420 -0.0015103306 0.09794990 1.774115e-02
## 5 AX-583035163 0.41748047 -0.034135672 -0.0070915978 0.20774742 4.948872e-05
## 6 AX-583035198 0.47530864 -0.050214091 -0.0119444444 0.23787037 1.449275e-02
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 3.472222e-03 0.17685932 0.7741935
## 2 3.858025e-04 0.01838992 0.9814815
## 3 7.812500e-03 0.17406250 0.7833333
## 4 1.745868e-03 0.09840780 0.8906250
## 5 1.033058e-05 0.20874613 0.7031250
## 6 3.472222e-03 0.23958333 0.3888889
## LocusName He FST T1 T2
## 1 AX-581444870.97856 0.34963580 -0.009030940 -0.0015891675 0.17596923
## 2 AX-583035083.305518 0.03635117 -0.021922428 -0.0004012346 0.01830247
## 3 AX-583035102.308124 0.33944444 0.002770083 0.0004799107 0.17324777
## 4 AX-583033342.315059 0.19482422 -0.015419420 -0.0015103306 0.09794990
## 5 AX-583035163.315386 0.41748047 -0.034135672 -0.0070915978 0.20774742
## 6 AX-583035198.330908 0.47530864 -0.050214091 -0.0119444444 0.23787037
## FSTNoCorr T1NoCorr T2NoCorr meanAlleleFreq
## 1 1.963268e-02 3.472222e-03 0.17685932 0.7741935
## 2 2.097902e-02 3.858025e-04 0.01838992 0.9814815
## 3 4.488330e-02 7.812500e-03 0.17406250 0.7833333
## 4 1.774115e-02 1.745868e-03 0.09840780 0.8906250
## 5 4.948872e-05 1.033058e-05 0.20874613 0.7031250
## 6 1.449275e-02 3.472222e-03 0.23958333 0.3888889
Data checks: Heterozygosity vs. FST Here, you can see how some of the low H loci have high FST. These are all neutral loci in the simulation, and it is important to exclude them from the OutFLANK algorithm.
Note that it is important to run this code with the uncorrected FSTs (FSTNoCorr) and the uncorrected mean FST (FSTNoCorrbar).
P1 <-
pOutlierFinderChiSqNoCorr(
my_fst4,
Fstbar = out_trim$FSTNoCorrbar,
dfInferred = out_trim$dfInferred,
qthreshold = 0.05,
Hmin = 0.1
)
head(P1)## LocusName He FST T1 T2
## 1 AX-581444870.97856 0.3496358 -0.009030940 -0.0015891675 0.1759692
## 3 AX-583035102.308124 0.3394444 0.002770083 0.0004799107 0.1732478
## 4 AX-583033342.315059 0.1948242 -0.015419420 -0.0015103306 0.0979499
## 5 AX-583035163.315386 0.4174805 -0.034135672 -0.0070915978 0.2077474
## 6 AX-583035198.330908 0.4753086 -0.050214091 -0.0119444444 0.2378704
## 7 AX-583035257.442875 0.4824219 -0.022973188 -0.0055564738 0.2418678
## FSTNoCorr T1NoCorr T2NoCorr meanAlleleFreq pvalues
## 1 1.963268e-02 3.472222e-03 0.1768593 0.7741935 0.436596141
## 3 4.488330e-02 7.812500e-03 0.1740625 0.7833333 0.861047004
## 4 1.774115e-02 1.745868e-03 0.0984078 0.8906250 0.399055729
## 5 4.948872e-05 1.033058e-05 0.2087461 0.7031250 0.001241235
## 6 1.449275e-02 3.472222e-03 0.2395833 0.3888889 0.332470646
## 7 1.376732e-02 3.347107e-03 0.2431198 0.5937500 0.317226460
## pvaluesRightTail qvalues OutlierFlag
## 1 0.7817019 0.9672329 FALSE
## 3 0.5694765 0.9672329 FALSE
## 4 0.8004721 0.9672329 FALSE
## 5 0.9993794 0.9672329 FALSE
## 6 0.8337647 0.9672329 FALSE
## 7 0.8413868 0.9672329 FALSE
## LocusName He FST T1 T2
## 40863 AX-583020716.486728242 0.08935547 0.172731684 0.0087651515 0.05074432
## 40884 AX-583027428.487155091 NA NA NA NA
## 40907 AX-583027669.487345325 0.00000000 0.000000000 0.0000000000 0.00000000
## 40920 AX-583030779.487439328 NA NA NA NA
## 40939 AX-583032255.487617812 0.06243496 -0.006135762 -0.0001939457 0.03160907
## 40980 AX-583032854.488822384 0.09209157 -0.016510452 -0.0007738039 0.04686751
## FSTNoCorr T1NoCorr T2NoCorr meanAlleleFreq pvalues pvaluesRightTail
## 40863 0.22018349 0.011250000 0.05109375 0.9531250 NA NA
## 40884 NA NA NA NA NA NA
## 40907 0.00000000 0.000000000 0.00000000 0.0000000 NA NA
## 40920 NA NA NA NA NA NA
## 40939 0.03246073 0.001033058 0.03182485 0.9677419 NA NA
## 40980 0.04902478 0.002324380 0.04741236 0.9516129 NA NA
## qvalues OutlierFlag
## 40863 NA NA
## 40884 NA NA
## 40907 NA NA
## 40920 NA NA
## 40939 NA NA
## 40980 NA NA
# notice how the output is ordered differently
my_out <- P1$OutlierFlag==TRUE
plot(P1$He, P1$FST, pch=19, col=rgb(0,0,0,0.1))
points(P1$He[my_out], P1$FST[my_out], col="blue")# check the P-value histogram for the full set of data
# if there are outliers, it should look flat with an inflation near 0Highlight outliers on Manhattan Plot
For publication, we want to show the accurate estimate of FST, not the uncorrected estimate. Remember to exclude those low H loci!
# Use the separate() function to split the LocusName column
P1 <- P1 |>
separate(LocusName, into = c("SNP_id", "LocusName"), sep = "\\.")
# Create plot
plot(P1$LocusName[P1$He>0.1], P1$FST[P1$He>0.1],
xlab="Position", ylab="FST", col=rgb(0,0,0,0.2))
points(P1$LocusName[my_out], P1$FST[my_out], col="magenta", pch=20) No outliers for MAN and NEW
# Read data from txt files
MAN_AUT <-
read.table(
here("output", "outflank", "man_aut_SNPs_outFlank.txt"),
stringsAsFactors = FALSE
) |>
drop_na()
NEW_AUT <-
read.table(
here("output", "outflank", "new_aut_SNPs_outFlank.txt"),
stringsAsFactors = FALSE
)
NEW_MAN_AUT <-
read.table(
here("output", "outflank", "SNPs_outFlank.txt"),
stringsAsFactors = FALSE
)
# Create a list with all dataframes
list_of_clusters <- list("MAN_AUT" = MAN_AUT$V1, "NEW_AUT" = NEW_AUT$V1, "NEW_MAN_AUT" = NEW_MAN_AUT$V1)
# Create Venn diagram
venn_diagram <- ggvenn(list_of_clusters, fill_color = c("steelblue", "darkorange", "pink"))
print(venn_diagram)# Find common SNPs
common_SNPs <- Reduce(intersect, list_of_clusters)
# # Save the shared SNP ids into a txt file
write.table(
common_SNPs,
file = here("output", "pcadapt", "common_SNPs_NEW_MAN_AUT.txt"),
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
# # # Save Venn diagram to PDF
output_path <- here("output", "pcadapt", "figures", "significant_snps_NEW_MAN_AUT.pdf")
ggsave(output_path, venn_diagram, height = 5, width = 5, dpi = 300)